In the telecom industry, customers are able to choose from multiple service providers and actively switch from one operator to another. In this highly competitive market, the telecommunications industry experiences an average annual churn rate of 15-25%.
Given the fact that it costs 5-10 times more to acquire a new customer than to retain an existing one, customer retention has now become even more important than customer acquisition.
For many incumbent operators, retaining highly profitable customers is the number one business goal. To reduce customer churn, telecom companies need to predict which customers are at high risk of churn.
Our goal is to build a machine learning model that is able to predict churning customers based on the features provided for their usage.
The steps that we have followed are outlined below:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Data manipulation libraries
import pandas as pd
import numpy as np
import missingno as msno
# Setting Display Options
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
pd.set_option('display.width', None)
# Data visualization libraries
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as pyo
from plotly.subplots import make_subplots
# Model building
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Read the raw training data from disk into a DataFrame
telecom = pd.read_csv("train.csv")
# Peek at the first five records to get a feel for the columns and value formats
telecom.head()
| id | circle_id | loc_og_t2o_mou | std_og_t2o_mou | loc_ic_t2o_mou | last_date_of_month_6 | last_date_of_month_7 | last_date_of_month_8 | arpu_6 | arpu_7 | arpu_8 | onnet_mou_6 | onnet_mou_7 | onnet_mou_8 | offnet_mou_6 | offnet_mou_7 | offnet_mou_8 | roam_ic_mou_6 | roam_ic_mou_7 | roam_ic_mou_8 | roam_og_mou_6 | roam_og_mou_7 | roam_og_mou_8 | loc_og_t2t_mou_6 | loc_og_t2t_mou_7 | loc_og_t2t_mou_8 | loc_og_t2m_mou_6 | loc_og_t2m_mou_7 | loc_og_t2m_mou_8 | loc_og_t2f_mou_6 | loc_og_t2f_mou_7 | loc_og_t2f_mou_8 | loc_og_t2c_mou_6 | loc_og_t2c_mou_7 | loc_og_t2c_mou_8 | loc_og_mou_6 | loc_og_mou_7 | loc_og_mou_8 | std_og_t2t_mou_6 | std_og_t2t_mou_7 | std_og_t2t_mou_8 | std_og_t2m_mou_6 | std_og_t2m_mou_7 | std_og_t2m_mou_8 | std_og_t2f_mou_6 | std_og_t2f_mou_7 | std_og_t2f_mou_8 | std_og_t2c_mou_6 | std_og_t2c_mou_7 | std_og_t2c_mou_8 | std_og_mou_6 | std_og_mou_7 | std_og_mou_8 | isd_og_mou_6 | isd_og_mou_7 | isd_og_mou_8 | spl_og_mou_6 | spl_og_mou_7 | spl_og_mou_8 | og_others_6 | og_others_7 | og_others_8 | total_og_mou_6 | total_og_mou_7 | total_og_mou_8 | loc_ic_t2t_mou_6 | loc_ic_t2t_mou_7 | loc_ic_t2t_mou_8 | loc_ic_t2m_mou_6 | loc_ic_t2m_mou_7 | loc_ic_t2m_mou_8 | loc_ic_t2f_mou_6 | loc_ic_t2f_mou_7 | loc_ic_t2f_mou_8 | loc_ic_mou_6 | loc_ic_mou_7 | loc_ic_mou_8 | std_ic_t2t_mou_6 | std_ic_t2t_mou_7 | std_ic_t2t_mou_8 | std_ic_t2m_mou_6 | std_ic_t2m_mou_7 | std_ic_t2m_mou_8 | std_ic_t2f_mou_6 | std_ic_t2f_mou_7 | std_ic_t2f_mou_8 | std_ic_t2o_mou_6 | std_ic_t2o_mou_7 | std_ic_t2o_mou_8 | std_ic_mou_6 | std_ic_mou_7 | std_ic_mou_8 | total_ic_mou_6 | total_ic_mou_7 | total_ic_mou_8 | spl_ic_mou_6 | spl_ic_mou_7 | spl_ic_mou_8 | isd_ic_mou_6 | isd_ic_mou_7 | isd_ic_mou_8 | ic_others_6 | ic_others_7 | ic_others_8 | total_rech_num_6 | total_rech_num_7 | total_rech_num_8 | total_rech_amt_6 | total_rech_amt_7 | total_rech_amt_8 | max_rech_amt_6 | max_rech_amt_7 | max_rech_amt_8 | date_of_last_rech_6 | date_of_last_rech_7 | date_of_last_rech_8 | last_day_rch_amt_6 
| last_day_rch_amt_7 | last_day_rch_amt_8 | date_of_last_rech_data_6 | date_of_last_rech_data_7 | date_of_last_rech_data_8 | total_rech_data_6 | total_rech_data_7 | total_rech_data_8 | max_rech_data_6 | max_rech_data_7 | max_rech_data_8 | count_rech_2g_6 | count_rech_2g_7 | count_rech_2g_8 | count_rech_3g_6 | count_rech_3g_7 | count_rech_3g_8 | av_rech_amt_data_6 | av_rech_amt_data_7 | av_rech_amt_data_8 | vol_2g_mb_6 | vol_2g_mb_7 | vol_2g_mb_8 | vol_3g_mb_6 | vol_3g_mb_7 | vol_3g_mb_8 | arpu_3g_6 | arpu_3g_7 | arpu_3g_8 | arpu_2g_6 | arpu_2g_7 | arpu_2g_8 | night_pck_user_6 | night_pck_user_7 | night_pck_user_8 | monthly_2g_6 | monthly_2g_7 | monthly_2g_8 | sachet_2g_6 | sachet_2g_7 | sachet_2g_8 | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | fb_user_6 | fb_user_7 | fb_user_8 | aon | aug_vbc_3g | jul_vbc_3g | jun_vbc_3g | churn_probability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 31.277 | 87.009 | 7.527 | 48.58 | 124.38 | 1.29 | 32.24 | 96.68 | 2.33 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.00 | 2.23 | 0.00 | 0.28 | 5.29 | 16.04 | 2.33 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 7.53 | 16.04 | 2.61 | 46.34 | 124.38 | 1.01 | 18.75 | 80.61 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 65.09 | 204.99 | 1.01 | 0.0 | 0.0 | 0.0 | 8.20 | 0.63 | 0.00 | 0.38 | 0.0 | 0.0 | 81.21 | 221.68 | 3.63 | 2.43 | 3.68 | 7.79 | 0.83 | 21.08 | 16.91 | 0.00 | 0.00 | 0.00 | 3.26 | 24.76 | 24.71 | 0.00 | 7.61 | 0.21 | 7.46 | 19.96 | 14.96 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.46 | 27.58 | 15.18 | 11.84 | 53.04 | 40.56 | 0.0 | 0.0 | 0.66 | 0.0 | 0.0 | 0.0 | 1.11 | 0.69 | 0.00 | 3 | 2 | 2 | 77 | 65 | 10 | 65 | 65 | 10 | 6/22/2014 | 7/10/2014 | 8/24/2014 | 65 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 1958 | 0.0 | 0.0 | 0.0 | 0 |
| 1 | 1 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 0.000 | 122.787 | 42.953 | 0.00 | 0.00 | 0.00 | 0.00 | 25.99 | 30.89 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 22.01 | 29.79 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 30.73 | 31.66 | 0.00 | 0.0 | 0.0 | 0.00 | 30.73 | 31.66 | 1.68 | 19.09 | 10.53 | 1.41 | 18.68 | 11.09 | 0.35 | 1.66 | 3.40 | 3.44 | 39.44 | 25.03 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 3.44 | 39.44 | 25.04 | 0.0 | 0.0 | 0.01 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 3 | 4 | 5 | 0 | 145 | 50 | 0 | 145 | 50 | 6/12/2014 | 7/10/2014 | 8/26/2014 | 0 | 0 | 0 | NaN | 7/8/2014 | NaN | NaN | 1.0 | NaN | NaN | 145.0 | NaN | NaN | 0.0 | NaN | NaN | 1.0 | NaN | NaN | 145.0 | NaN | 0.0 | 352.91 | 0.00 | 0.0 | 3.96 | 0.0 | NaN | 122.07 | NaN | NaN | 122.08 | NaN | NaN | 0.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NaN | 1.0 | NaN | 710 | 0.0 | 0.0 | 0.0 | 0 |
| 2 | 2 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 60.806 | 103.176 | 0.000 | 0.53 | 15.93 | 0.00 | 53.99 | 82.05 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.00 | 0.53 | 12.98 | 0.00 | 24.11 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2.14 | 0.00 | 0.00 | 24.64 | 12.98 | 0.00 | 0.00 | 2.94 | 0.00 | 28.94 | 82.05 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.94 | 84.99 | 0.00 | 0.0 | 0.0 | 0.0 | 2.89 | 1.38 | 0.00 | 0.00 | 0.0 | 0.0 | 56.49 | 99.36 | 0.00 | 4.51 | 6.16 | 6.49 | 89.86 | 25.18 | 23.51 | 0.00 | 0.00 | 0.00 | 94.38 | 31.34 | 30.01 | 11.69 | 0.00 | 0.00 | 18.21 | 2.48 | 6.38 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 29.91 | 2.48 | 6.38 | 124.29 | 33.83 | 36.64 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.25 | 2 | 4 | 2 | 70 | 120 | 0 | 70 | 70 | 0 | 6/11/2014 | 7/22/2014 | 8/24/2014 | 70 | 50 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 882 | 0.0 | 0.0 | 0.0 | 0 |
| 3 | 3 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 156.362 | 205.260 | 111.095 | 7.26 | 16.01 | 0.00 | 68.76 | 78.48 | 50.23 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 1.63 | 6.99 | 3.94 | 0.00 | 37.91 | 44.89 | 23.63 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 8.03 | 44.91 | 48.84 | 23.63 | 0.26 | 12.06 | 0.00 | 15.33 | 25.93 | 4.6 | 0.56 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 16.16 | 37.99 | 4.60 | 0.0 | 0.0 | 0.0 | 14.95 | 9.13 | 25.61 | 0.00 | 0.0 | 0.0 | 76.03 | 95.98 | 53.84 | 24.98 | 4.84 | 23.88 | 53.99 | 44.23 | 57.14 | 7.23 | 0.81 | 0.00 | 86.21 | 49.89 | 81.03 | 0.00 | 0.00 | 0.00 | 8.89 | 0.28 | 2.81 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.89 | 0.28 | 2.81 | 95.11 | 50.18 | 83.84 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 2 | 4 | 3 | 160 | 240 | 130 | 110 | 110 | 50 | 6/15/2014 | 7/21/2014 | 8/25/2014 | 110 | 110 | 50 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 982 | 0.0 | 0.0 | 0.0 | 0 |
| 4 | 4 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 240.708 | 128.191 | 101.565 | 21.28 | 4.83 | 6.13 | 56.99 | 38.11 | 9.63 | 53.64 | 0.0 | 0.0 | 15.73 | 0.0 | 0.00 | 10.16 | 4.83 | 6.13 | 36.74 | 19.88 | 4.61 | 11.99 | 1.23 | 5.01 | 0.00 | 9.85 | 0.00 | 58.91 | 25.94 | 15.76 | 0.00 | 0.00 | 0.00 | 4.35 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.35 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 17.00 | 0.00 | 0.00 | 0.0 | 0.0 | 63.26 | 42.94 | 15.76 | 5.44 | 1.39 | 2.66 | 10.58 | 4.33 | 19.49 | 5.51 | 3.63 | 6.14 | 21.54 | 9.36 | 28.31 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 21.54 | 9.36 | 28.31 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 13 | 10 | 8 | 290 | 136 | 122 | 50 | 41 | 30 | 6/25/2014 | 7/26/2014 | 8/30/2014 | 25 | 10 | 30 | 6/25/2014 | 7/23/2014 | 8/20/2014 | 7.0 | 7.0 | 6.0 | 25.0 | 41.0 | 25.0 | 7.0 | 6.0 | 6.0 | 0.0 | 1.0 | 0.0 | 175.0 | 191.0 | 142.0 | 390.8 | 308.89 | 213.47 | 0.0 | 0.00 | 0.0 | 0.0 | 35.00 | 0.0 | 0.0 | 35.12 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 7 | 6 | 6 | 0 | 0 | 0 | 0 | 1 | 0 | 1.0 | 1.0 | 1.0 | 647 | 0.0 | 0.0 | 0.0 | 0 |
# How many rows and columns are present?
# .shape returns a (rows, columns) tuple — the output below shows (69999, 172)
telecom.shape
(69999, 172)
The dataset has 172 columns and 69999 rows of data. Since we have 172 columns, let's remove columns that we do not need and then go ahead and further understand the data.
# What are the datatypes involved?
# verbose=True (a proper bool instead of the integer 1) forces pandas to list
# every column's name and dtype even though the frame has 172 columns
telecom.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 69999 entries, 0 to 69998 Data columns (total 172 columns): # Column Dtype --- ------ ----- 0 id int64 1 circle_id int64 2 loc_og_t2o_mou float64 3 std_og_t2o_mou float64 4 loc_ic_t2o_mou float64 5 last_date_of_month_6 object 6 last_date_of_month_7 object 7 last_date_of_month_8 object 8 arpu_6 float64 9 arpu_7 float64 10 arpu_8 float64 11 onnet_mou_6 float64 12 onnet_mou_7 float64 13 onnet_mou_8 float64 14 offnet_mou_6 float64 15 offnet_mou_7 float64 16 offnet_mou_8 float64 17 roam_ic_mou_6 float64 18 roam_ic_mou_7 float64 19 roam_ic_mou_8 float64 20 roam_og_mou_6 float64 21 roam_og_mou_7 float64 22 roam_og_mou_8 float64 23 loc_og_t2t_mou_6 float64 24 loc_og_t2t_mou_7 float64 25 loc_og_t2t_mou_8 float64 26 loc_og_t2m_mou_6 float64 27 loc_og_t2m_mou_7 float64 28 loc_og_t2m_mou_8 float64 29 loc_og_t2f_mou_6 float64 30 loc_og_t2f_mou_7 float64 31 loc_og_t2f_mou_8 float64 32 loc_og_t2c_mou_6 float64 33 loc_og_t2c_mou_7 float64 34 loc_og_t2c_mou_8 float64 35 loc_og_mou_6 float64 36 loc_og_mou_7 float64 37 loc_og_mou_8 float64 38 std_og_t2t_mou_6 float64 39 std_og_t2t_mou_7 float64 40 std_og_t2t_mou_8 float64 41 std_og_t2m_mou_6 float64 42 std_og_t2m_mou_7 float64 43 std_og_t2m_mou_8 float64 44 std_og_t2f_mou_6 float64 45 std_og_t2f_mou_7 float64 46 std_og_t2f_mou_8 float64 47 std_og_t2c_mou_6 float64 48 std_og_t2c_mou_7 float64 49 std_og_t2c_mou_8 float64 50 std_og_mou_6 float64 51 std_og_mou_7 float64 52 std_og_mou_8 float64 53 isd_og_mou_6 float64 54 isd_og_mou_7 float64 55 isd_og_mou_8 float64 56 spl_og_mou_6 float64 57 spl_og_mou_7 float64 58 spl_og_mou_8 float64 59 og_others_6 float64 60 og_others_7 float64 61 og_others_8 float64 62 total_og_mou_6 float64 63 total_og_mou_7 float64 64 total_og_mou_8 float64 65 loc_ic_t2t_mou_6 float64 66 loc_ic_t2t_mou_7 float64 67 loc_ic_t2t_mou_8 float64 68 loc_ic_t2m_mou_6 float64 69 loc_ic_t2m_mou_7 float64 70 loc_ic_t2m_mou_8 float64 71 loc_ic_t2f_mou_6 float64 72 
loc_ic_t2f_mou_7 float64 73 loc_ic_t2f_mou_8 float64 74 loc_ic_mou_6 float64 75 loc_ic_mou_7 float64 76 loc_ic_mou_8 float64 77 std_ic_t2t_mou_6 float64 78 std_ic_t2t_mou_7 float64 79 std_ic_t2t_mou_8 float64 80 std_ic_t2m_mou_6 float64 81 std_ic_t2m_mou_7 float64 82 std_ic_t2m_mou_8 float64 83 std_ic_t2f_mou_6 float64 84 std_ic_t2f_mou_7 float64 85 std_ic_t2f_mou_8 float64 86 std_ic_t2o_mou_6 float64 87 std_ic_t2o_mou_7 float64 88 std_ic_t2o_mou_8 float64 89 std_ic_mou_6 float64 90 std_ic_mou_7 float64 91 std_ic_mou_8 float64 92 total_ic_mou_6 float64 93 total_ic_mou_7 float64 94 total_ic_mou_8 float64 95 spl_ic_mou_6 float64 96 spl_ic_mou_7 float64 97 spl_ic_mou_8 float64 98 isd_ic_mou_6 float64 99 isd_ic_mou_7 float64 100 isd_ic_mou_8 float64 101 ic_others_6 float64 102 ic_others_7 float64 103 ic_others_8 float64 104 total_rech_num_6 int64 105 total_rech_num_7 int64 106 total_rech_num_8 int64 107 total_rech_amt_6 int64 108 total_rech_amt_7 int64 109 total_rech_amt_8 int64 110 max_rech_amt_6 int64 111 max_rech_amt_7 int64 112 max_rech_amt_8 int64 113 date_of_last_rech_6 object 114 date_of_last_rech_7 object 115 date_of_last_rech_8 object 116 last_day_rch_amt_6 int64 117 last_day_rch_amt_7 int64 118 last_day_rch_amt_8 int64 119 date_of_last_rech_data_6 object 120 date_of_last_rech_data_7 object 121 date_of_last_rech_data_8 object 122 total_rech_data_6 float64 123 total_rech_data_7 float64 124 total_rech_data_8 float64 125 max_rech_data_6 float64 126 max_rech_data_7 float64 127 max_rech_data_8 float64 128 count_rech_2g_6 float64 129 count_rech_2g_7 float64 130 count_rech_2g_8 float64 131 count_rech_3g_6 float64 132 count_rech_3g_7 float64 133 count_rech_3g_8 float64 134 av_rech_amt_data_6 float64 135 av_rech_amt_data_7 float64 136 av_rech_amt_data_8 float64 137 vol_2g_mb_6 float64 138 vol_2g_mb_7 float64 139 vol_2g_mb_8 float64 140 vol_3g_mb_6 float64 141 vol_3g_mb_7 float64 142 vol_3g_mb_8 float64 143 arpu_3g_6 float64 144 arpu_3g_7 float64 145 arpu_3g_8 float64 
146 arpu_2g_6 float64 147 arpu_2g_7 float64 148 arpu_2g_8 float64 149 night_pck_user_6 float64 150 night_pck_user_7 float64 151 night_pck_user_8 float64 152 monthly_2g_6 int64 153 monthly_2g_7 int64 154 monthly_2g_8 int64 155 sachet_2g_6 int64 156 sachet_2g_7 int64 157 sachet_2g_8 int64 158 monthly_3g_6 int64 159 monthly_3g_7 int64 160 monthly_3g_8 int64 161 sachet_3g_6 int64 162 sachet_3g_7 int64 163 sachet_3g_8 int64 164 fb_user_6 float64 165 fb_user_7 float64 166 fb_user_8 float64 167 aon int64 168 aug_vbc_3g float64 169 jul_vbc_3g float64 170 jun_vbc_3g float64 171 churn_probability int64 dtypes: float64(135), int64(28), object(9) memory usage: 91.9+ MB
# Check the data spread.
# describe() gives count/mean/std/min/quartiles/max for the numeric columns —
# useful for spotting constant columns (e.g. circle_id) and negative-value anomalies (e.g. arpu_* minimums)
telecom.describe()
| id | circle_id | loc_og_t2o_mou | std_og_t2o_mou | loc_ic_t2o_mou | arpu_6 | arpu_7 | arpu_8 | onnet_mou_6 | onnet_mou_7 | onnet_mou_8 | offnet_mou_6 | offnet_mou_7 | offnet_mou_8 | roam_ic_mou_6 | roam_ic_mou_7 | roam_ic_mou_8 | roam_og_mou_6 | roam_og_mou_7 | roam_og_mou_8 | loc_og_t2t_mou_6 | loc_og_t2t_mou_7 | loc_og_t2t_mou_8 | loc_og_t2m_mou_6 | loc_og_t2m_mou_7 | loc_og_t2m_mou_8 | loc_og_t2f_mou_6 | loc_og_t2f_mou_7 | loc_og_t2f_mou_8 | loc_og_t2c_mou_6 | loc_og_t2c_mou_7 | loc_og_t2c_mou_8 | loc_og_mou_6 | loc_og_mou_7 | loc_og_mou_8 | std_og_t2t_mou_6 | std_og_t2t_mou_7 | std_og_t2t_mou_8 | std_og_t2m_mou_6 | std_og_t2m_mou_7 | std_og_t2m_mou_8 | std_og_t2f_mou_6 | std_og_t2f_mou_7 | std_og_t2f_mou_8 | std_og_t2c_mou_6 | std_og_t2c_mou_7 | std_og_t2c_mou_8 | std_og_mou_6 | std_og_mou_7 | std_og_mou_8 | isd_og_mou_6 | isd_og_mou_7 | isd_og_mou_8 | spl_og_mou_6 | spl_og_mou_7 | spl_og_mou_8 | og_others_6 | og_others_7 | og_others_8 | total_og_mou_6 | total_og_mou_7 | total_og_mou_8 | loc_ic_t2t_mou_6 | loc_ic_t2t_mou_7 | loc_ic_t2t_mou_8 | loc_ic_t2m_mou_6 | loc_ic_t2m_mou_7 | loc_ic_t2m_mou_8 | loc_ic_t2f_mou_6 | loc_ic_t2f_mou_7 | loc_ic_t2f_mou_8 | loc_ic_mou_6 | loc_ic_mou_7 | loc_ic_mou_8 | std_ic_t2t_mou_6 | std_ic_t2t_mou_7 | std_ic_t2t_mou_8 | std_ic_t2m_mou_6 | std_ic_t2m_mou_7 | std_ic_t2m_mou_8 | std_ic_t2f_mou_6 | std_ic_t2f_mou_7 | std_ic_t2f_mou_8 | std_ic_t2o_mou_6 | std_ic_t2o_mou_7 | std_ic_t2o_mou_8 | std_ic_mou_6 | std_ic_mou_7 | std_ic_mou_8 | total_ic_mou_6 | total_ic_mou_7 | total_ic_mou_8 | spl_ic_mou_6 | spl_ic_mou_7 | spl_ic_mou_8 | isd_ic_mou_6 | isd_ic_mou_7 | isd_ic_mou_8 | ic_others_6 | ic_others_7 | ic_others_8 | total_rech_num_6 | total_rech_num_7 | total_rech_num_8 | total_rech_amt_6 | total_rech_amt_7 | total_rech_amt_8 | max_rech_amt_6 | max_rech_amt_7 | max_rech_amt_8 | last_day_rch_amt_6 | last_day_rch_amt_7 | last_day_rch_amt_8 | total_rech_data_6 | total_rech_data_7 | total_rech_data_8 | max_rech_data_6 | 
max_rech_data_7 | max_rech_data_8 | count_rech_2g_6 | count_rech_2g_7 | count_rech_2g_8 | count_rech_3g_6 | count_rech_3g_7 | count_rech_3g_8 | av_rech_amt_data_6 | av_rech_amt_data_7 | av_rech_amt_data_8 | vol_2g_mb_6 | vol_2g_mb_7 | vol_2g_mb_8 | vol_3g_mb_6 | vol_3g_mb_7 | vol_3g_mb_8 | arpu_3g_6 | arpu_3g_7 | arpu_3g_8 | arpu_2g_6 | arpu_2g_7 | arpu_2g_8 | night_pck_user_6 | night_pck_user_7 | night_pck_user_8 | monthly_2g_6 | monthly_2g_7 | monthly_2g_8 | sachet_2g_6 | sachet_2g_7 | sachet_2g_8 | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | fb_user_6 | fb_user_7 | fb_user_8 | aon | aug_vbc_3g | jul_vbc_3g | jun_vbc_3g | churn_probability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 69999.000000 | 69999.0 | 69297.0 | 69297.0 | 69297.0 | 69999.000000 | 69999.000000 | 69999.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.0 | 67312.0 | 66296.0 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.0 | 67312.0 | 66296.0 | 67231.000000 | 67312.000000 | 66296.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 67231.000000 | 67312.000000 | 66296.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 17568.000000 | 17865.000000 | 
18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 17568.000000 | 17865.000000 | 18417.000000 | 69999.000000 | 69999.000000 | 69999.000000 | 69999.00000 | 69999.000000 |
| mean | 34999.000000 | 109.0 | 0.0 | 0.0 | 0.0 | 283.134365 | 278.185912 | 278.858826 | 133.153275 | 133.894438 | 132.978257 | 198.874771 | 197.153383 | 196.543577 | 9.765435 | 7.014568 | 7.004892 | 14.186457 | 9.842191 | 9.771783 | 46.904854 | 46.166503 | 45.686109 | 93.238231 | 90.799240 | 91.121447 | 3.743179 | 3.777031 | 3.661652 | 1.126025 | 1.361052 | 1.420840 | 143.893585 | 140.750120 | 140.476486 | 80.619382 | 83.775851 | 83.471486 | 88.152110 | 91.538615 | 90.586999 | 1.126377 | 1.084062 | 1.057739 | 0.0 | 0.0 | 0.0 | 169.900601 | 176.401217 | 175.118852 | 0.845763 | 0.811100 | 0.841648 | 3.958619 | 4.976783 | 5.045027 | 0.462581 | 0.024425 | 0.033059 | 306.451436 | 310.572674 | 304.513065 | 48.043255 | 47.882736 | 47.256388 | 107.152439 | 106.489856 | 108.154731 | 12.050672 | 12.563665 | 11.716763 | 167.255126 | 166.945103 | 167.136761 | 9.476958 | 9.873468 | 9.910217 | 20.734858 | 21.685359 | 21.089042 | 2.146273 | 2.199395 | 2.075179 | 0.0 | 0.0 | 0.0 | 32.360632 | 33.760809 | 33.077030 | 199.710640 | 201.878029 | 198.486034 | 0.061932 | 0.033371 | 0.040392 | 7.394167 | 8.171162 | 8.348424 | 0.854063 | 1.019680 | 0.963214 | 7.566522 | 7.706667 | 7.224932 | 328.139788 | 322.376363 | 323.846355 | 104.569265 | 104.137573 | 107.540351 | 63.426949 | 59.294218 | 62.489478 | 2.467612 | 2.679989 | 2.652441 | 126.500000 | 126.402071 | 125.374925 | 1.865323 | 2.056311 | 2.016018 | 0.602288 | 0.623678 | 0.636423 | 192.831096 | 201.455940 | 196.815792 | 51.773924 | 51.240204 | 50.127506 | 122.171882 | 128.934444 | 135.486541 | 90.069931 | 89.115767 | 90.618564 | 86.863900 | 85.846074 | 86.348404 | 0.025273 | 0.024069 | 0.021013 | 0.079287 | 0.083401 | 0.080930 | 0.388863 | 0.441406 | 0.449492 | 0.075815 | 0.077730 | 0.081958 | 0.075344 | 0.081444 | 0.085487 | 0.916325 | 0.909544 | 0.890319 | 1220.639709 | 68.108597 | 65.935830 | 60.07674 | 0.101887 |
| std | 20207.115084 | 0.0 | 0.0 | 0.0 | 0.0 | 334.213918 | 344.366927 | 351.924315 | 299.963093 | 311.277193 | 311.896596 | 316.818355 | 322.482226 | 324.089234 | 57.374429 | 55.960985 | 53.408135 | 73.469261 | 58.511894 | 64.618388 | 150.971758 | 154.739002 | 153.716880 | 162.046699 | 153.852597 | 152.997805 | 13.319542 | 13.568110 | 13.009193 | 5.741811 | 7.914113 | 6.542202 | 252.034597 | 246.313148 | 245.342359 | 255.098355 | 266.693254 | 267.021929 | 255.771554 | 267.532089 | 270.032002 | 8.136645 | 8.325206 | 7.696853 | 0.0 | 0.0 | 0.0 | 392.046600 | 409.299501 | 410.697098 | 29.747486 | 29.220073 | 29.563367 | 15.854529 | 22.229842 | 17.708507 | 4.768437 | 1.716430 | 2.232547 | 465.502866 | 479.131770 | 477.936832 | 140.499757 | 147.761124 | 141.249368 | 168.455999 | 165.452459 | 166.223461 | 39.416076 | 43.495179 | 38.606895 | 252.576231 | 254.688718 | 249.288410 | 51.664472 | 56.137824 | 54.248186 | 80.294236 | 87.314510 | 81.534344 | 16.522232 | 16.171533 | 15.865403 | 0.0 | 0.0 | 0.0 | 104.381082 | 114.142230 | 108.469864 | 290.114823 | 296.771338 | 288.336731 | 0.164823 | 0.137322 | 0.148417 | 60.951165 | 63.604165 | 63.097570 | 12.149144 | 13.225373 | 11.697686 | 7.041452 | 7.050614 | 7.195597 | 404.211068 | 411.070120 | 426.181405 | 121.407701 | 120.782543 | 124.396750 | 97.954876 | 95.429492 | 101.996729 | 2.794610 | 3.073472 | 3.101265 | 109.352573 | 109.459266 | 109.648799 | 2.566377 | 2.799916 | 2.728246 | 1.279297 | 1.401230 | 1.457058 | 190.623115 | 198.346141 | 192.280532 | 212.513909 | 211.114667 | 213.101403 | 554.869965 | 554.096072 | 568.310234 | 193.600413 | 195.826990 | 189.907986 | 171.321203 | 178.067280 | 170.297094 | 0.156958 | 0.153269 | 0.143432 | 0.294719 | 0.304802 | 0.299254 | 1.494206 | 1.651012 | 1.632450 | 0.358905 | 0.383189 | 0.381821 | 0.573003 | 0.634547 | 0.680035 | 0.276907 | 0.286842 | 0.312501 | 952.426321 | 269.328659 | 267.899034 | 257.22681 | 0.302502 |
| min | 0.000000 | 109.0 | 0.0 | 0.0 | 0.0 | -2258.709000 | -1289.715000 | -945.808000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -20.380000 | -26.040000 | -24.490000 | -35.830000 | -13.090000 | -55.830000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 180.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 25% | 17499.500000 | 109.0 | 0.0 | 0.0 | 0.0 | 93.581000 | 86.714000 | 84.095000 | 7.410000 | 6.675000 | 6.410000 | 34.860000 | 32.240000 | 31.575000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.660000 | 1.650000 | 1.610000 | 9.920000 | 10.090000 | 9.830000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.235000 | 17.590000 | 17.237500 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 44.780000 | 42.910000 | 38.710000 | 3.030000 | 3.260000 | 3.280000 | 17.390000 | 18.610000 | 18.940000 | 0.000000 | 0.000000 | 0.000000 | 30.630000 | 32.710000 | 32.810000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.030000 | 38.640000 | 41.340000 | 38.290000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 | 3.000000 | 3.000000 | 110.000000 | 100.000000 | 90.000000 | 30.000000 | 30.000000 | 30.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 25.000000 | 25.000000 | 25.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 82.000000 | 92.000000 | 84.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 468.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 50% | 34999.000000 | 109.0 | 0.0 | 0.0 | 0.0 | 197.484000 | 191.588000 | 192.234000 | 34.110000 | 32.280000 | 32.100000 | 96.480000 | 91.885000 | 91.800000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.910000 | 11.580000 | 11.740000 | 41.030000 | 40.170000 | 40.350000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 65.190000 | 63.430000 | 63.520000 | 0.000000 | 0.000000 | 0.000000 | 3.980000 | 3.710000 | 3.300000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 11.730000 | 11.260000 | 10.505000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 145.280000 | 141.230000 | 138.360000 | 15.740000 | 15.830000 | 16.040000 | 56.460000 | 56.930000 | 58.210000 | 0.880000 | 0.910000 | 0.930000 | 92.430000 | 92.510000 | 93.890000 | 0.000000 | 0.000000 | 0.000000 | 2.040000 | 2.060000 | 2.030000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 5.910000 | 5.980000 | 5.830000 | 114.780000 | 116.330000 | 114.610000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 6.000000 | 5.000000 | 229.000000 | 220.000000 | 225.000000 | 110.000000 | 110.000000 | 98.000000 | 30.000000 | 30.000000 | 30.000000 | 1.000000 | 2.000000 | 1.000000 | 145.000000 | 145.000000 | 145.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 154.000000 | 154.000000 | 154.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.520000 | 0.420000 | 0.840000 | 11.300000 | 8.800000 | 9.090000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 868.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 75% | 52498.500000 | 109.0 | 0.0 | 0.0 | 0.0 | 370.791000 | 365.369500 | 369.909000 | 119.390000 | 115.837500 | 115.060000 | 232.990000 | 227.630000 | 229.345000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 40.740000 | 39.760000 | 39.895000 | 110.430000 | 107.540000 | 109.245000 | 2.060000 | 2.080000 | 2.030000 | 0.000000 | 0.000000 | 0.000000 | 167.880000 | 163.932500 | 165.615000 | 31.020000 | 31.300000 | 30.760000 | 53.745000 | 54.640000 | 52.660000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 146.335000 | 151.645000 | 149.015000 | 0.000000 | 0.000000 | 0.000000 | 2.400000 | 3.660000 | 4.002500 | 0.000000 | 0.000000 | 0.000000 | 374.305000 | 380.045000 | 370.895000 | 46.980000 | 45.690000 | 46.280000 | 132.020000 | 131.010000 | 134.380000 | 8.140000 | 8.230000 | 8.090000 | 208.325000 | 205.530000 | 208.060000 | 4.060000 | 4.180000 | 4.052500 | 14.960000 | 15.830000 | 15.310000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 26.780000 | 28.160000 | 27.615000 | 251.070000 | 249.470000 | 249.710000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 10.000000 | 9.000000 | 438.000000 | 430.000000 | 436.000000 | 120.000000 | 128.000000 | 144.000000 | 110.000000 | 110.000000 | 130.000000 | 3.000000 | 3.000000 | 3.000000 | 177.000000 | 177.000000 | 179.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 252.000000 | 252.000000 | 252.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 122.070000 | 120.860000 | 122.070000 | 122.070000 | 122.070000 | 122.070000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1813.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| max | 69998.000000 | 109.0 | 0.0 | 0.0 | 0.0 | 27731.088000 | 35145.834000 | 33543.624000 | 7376.710000 | 8157.780000 | 10752.560000 | 8362.360000 | 7043.980000 | 14007.340000 | 2850.980000 | 4155.830000 | 4169.810000 | 3775.110000 | 2812.040000 | 5337.040000 | 6431.330000 | 7400.660000 | 10752.560000 | 4696.830000 | 4557.140000 | 4961.330000 | 617.580000 | 815.330000 | 588.290000 | 342.860000 | 916.240000 | 351.830000 | 10643.380000 | 7674.780000 | 11039.910000 | 7366.580000 | 8133.660000 | 8014.430000 | 8314.760000 | 6622.540000 | 13950.040000 | 628.560000 | 465.790000 | 354.160000 | 0.0 | 0.0 | 0.0 | 8432.990000 | 8155.530000 | 13980.060000 | 5900.660000 | 5490.280000 | 5681.540000 | 1023.210000 | 2372.510000 | 1075.080000 | 800.890000 | 270.240000 | 394.930000 | 10674.030000 | 8285.640000 | 14043.060000 | 5315.590000 | 9324.660000 | 10696.230000 | 4450.740000 | 4455.830000 | 6274.190000 | 1872.340000 | 1983.010000 | 1676.580000 | 7454.630000 | 9669.910000 | 10830.160000 | 3336.380000 | 4708.710000 | 3930.240000 | 5647.160000 | 6141.880000 | 5512.760000 | 1351.110000 | 1136.080000 | 1394.890000 | 0.0 | 0.0 | 0.0 | 5712.110000 | 6745.760000 | 5658.740000 | 7716.140000 | 9699.010000 | 10830.380000 | 19.760000 | 13.460000 | 16.860000 | 6789.410000 | 5289.540000 | 4127.010000 | 1362.940000 | 1495.940000 | 1209.860000 | 170.000000 | 138.000000 | 138.000000 | 35190.000000 | 40335.000000 | 45320.000000 | 4010.000000 | 3299.000000 | 4449.000000 | 4010.000000 | 3100.000000 | 4449.000000 | 61.000000 | 54.000000 | 60.000000 | 1555.000000 | 1555.000000 | 1555.000000 | 42.000000 | 48.000000 | 44.000000 | 29.000000 | 34.000000 | 45.000000 | 5920.000000 | 4365.000000 | 4076.000000 | 10285.900000 | 7873.550000 | 11117.610000 | 45735.400000 | 28144.120000 | 30036.060000 | 5054.370000 | 4980.900000 | 3716.900000 | 5054.350000 | 4809.360000 | 3483.170000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 5.000000 | 5.000000 | 42.000000 | 48.000000 | 44.000000 | 9.000000 | 16.000000 | 
16.000000 | 29.000000 | 33.000000 | 41.000000 | 1.000000 | 1.000000 | 1.000000 | 4337.000000 | 12916.220000 | 9165.600000 | 11166.21000 | 1.000000 |
A lot of outliers, negative values and different scales of data can be observed.
# Recode month abbreviations (jun/jul/aug) to numeric suffixes (6/7/8) so that
# all month-wise columns follow a uniform "<feature>_<month>" naming scheme.
months = ['aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g']
month_rename_map = {
    'aug_vbc_3g': '3g_vbc_8',
    'jul_vbc_3g': '3g_vbc_7',
    'jun_vbc_3g': '3g_vbc_6',
}
telecom = telecom.rename(columns=month_rename_map)
We define high-value customers as those customers whose average total recharge amount (over months 6 and 7) lies above the 70th percentile.
# Impute NaN with 0 in the data-recharge columns - a null here indicates
# that no data recharge was done in that month.
for rech_col in ('total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
                 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8'):
    telecom[rech_col] = telecom[rech_col].replace(np.nan, 0.0)
# Derive the total recharge amount for data per month:
# total_rech_amt_data_<m> = average data recharge amount x number of data recharges
for m in ('6', '7', '8'):
    telecom['total_rech_amt_data_' + m] = (
        telecom['av_rech_amt_data_' + m] * telecom['total_rech_data_' + m]
    )
# Average of the combined (voice + data) recharge amount over months 6 and 7,
# the "good phase" used to identify high value customers below.
telecom['total_avg_rech_amt_6_7'] = (
    telecom['total_rech_amt_6'] + telecom['total_rech_amt_data_6']
    + telecom['total_rech_amt_7'] + telecom['total_rech_amt_data_7']
) / 2
# High value customers are those whose average recharge amount over months
# 6 and 7 lies strictly above the 70th percentile of that amount.
high_value_filter = telecom['total_avg_rech_amt_6_7'].quantile(0.7)
print('70 percentile of 6th and 7th months avg recharge amount: '+str(high_value_filter))
telecom_high_val_cust = telecom.loc[telecom['total_avg_rech_amt_6_7'] > high_value_filter]
print('Dataframe shape after filtering High Value Customers: ' + str(telecom_high_val_cust.shape))
70 percentile of 6th and 7th months avg recharge amount: 477.5 Dataframe shape after filtering High Value Customers: (20998, 176)
# total_rech_amt_data_* now carries the combined information, so the source
# columns av_rech_amt_data_* and total_rech_data_* are redundant - drop them.
columns_to_drop = [
    'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8',
    'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
]
telecom_high_val_cust = telecom_high_val_cust.drop(columns_to_drop, axis=1)
print("Dataframe shape after dropping transformed columns:",str(telecom_high_val_cust.shape))
Dataframe shape after dropping transformed columns: (20998, 170)
# Columns holding a single constant value carry no information for analysis
# or modelling, so identify and drop them.
biased_columns = [column for column in telecom_high_val_cust.columns
                  if telecom_high_val_cust[column].nunique() == 1]
print("Number of columns with biased data:",len(biased_columns))
print("We will drop them.")
# Dropping columns that have only one unique value
telecom_high_val_cust = telecom_high_val_cust.drop(biased_columns,axis=1)
print("Dataframe shape after dropping biased columns:",str(telecom_high_val_cust.shape))
Number of columns with biased data: 13 We will drop them. Dataframe shape after dropping biased columns: (20998, 157)
# Date columns and the customer id add no value to this analysis, and the
# helper column created for the high-value filter has served its purpose.
date_columns = [col for col in telecom_high_val_cust.columns if 'date' in col]
telecom_high_val_cust = telecom_high_val_cust.drop(date_columns, axis=1)
telecom_high_val_cust = telecom_high_val_cust.drop(['id', 'total_avg_rech_amt_6_7'], axis=1)
print("Dataframe shape after dropping columns not useful for analysis:",str(telecom_high_val_cust.shape))
Dataframe shape after dropping columns not useful for analysis: (20998, 149)
# Using the missingno library, visualize per-column data completeness as a
# bar chart to spot columns with missing values at a glance
msno.bar(telecom_high_val_cust)
<Axes: >
def metadata_matrix(data):
    """Summarize each column of *data*.

    Returns a DataFrame indexed by column name with the dtype, the percentage
    of null values (rounded to 2 decimals) and the distinct-value count,
    sorted by null percentage, highest first.

    Bug fix: the original body read the global ``telecom_high_val_cust``
    instead of the ``data`` argument, so the function ignored its input.
    """
    return pd.DataFrame({
        'Datatype': data.dtypes.astype(str),
        'Null_Percentage': round(data.isnull().sum() / len(data) * 100, 2),
        'Unique_Values_Count': data.nunique().astype(int)
    }).sort_values(by='Null_Percentage', ascending=False)
metadata_matrix(telecom_high_val_cust)
| Datatype | Null_Percentage | Unique_Values_Count | |
|---|---|---|---|
| arpu_3g_8 | float64 | 46.78 | 4412 |
| fb_user_8 | float64 | 46.78 | 2 |
| arpu_2g_8 | float64 | 46.78 | 3604 |
| max_rech_data_8 | float64 | 46.78 | 46 |
| night_pck_user_8 | float64 | 46.78 | 2 |
| count_rech_3g_8 | float64 | 46.78 | 29 |
| count_rech_2g_8 | float64 | 46.78 | 33 |
| arpu_2g_6 | float64 | 44.35 | 4363 |
| count_rech_3g_6 | float64 | 44.35 | 23 |
| max_rech_data_6 | float64 | 44.35 | 46 |
| count_rech_2g_6 | float64 | 44.35 | 29 |
| fb_user_6 | float64 | 44.35 | 2 |
| night_pck_user_6 | float64 | 44.35 | 2 |
| arpu_3g_6 | float64 | 44.35 | 4627 |
| count_rech_3g_7 | float64 | 43.26 | 27 |
| night_pck_user_7 | float64 | 43.26 | 2 |
| max_rech_data_7 | float64 | 43.26 | 44 |
| fb_user_7 | float64 | 43.26 | 2 |
| arpu_3g_7 | float64 | 43.26 | 4572 |
| count_rech_2g_7 | float64 | 43.26 | 33 |
| arpu_2g_7 | float64 | 43.26 | 4102 |
| loc_ic_t2f_mou_8 | float64 | 3.91 | 3933 |
| isd_og_mou_8 | float64 | 3.91 | 690 |
| spl_og_mou_8 | float64 | 3.91 | 2779 |
| og_others_8 | float64 | 3.91 | 103 |
| loc_ic_t2t_mou_8 | float64 | 3.91 | 7801 |
| loc_ic_t2m_mou_8 | float64 | 3.91 | 12147 |
| std_ic_mou_8 | float64 | 3.91 | 6382 |
| loc_ic_mou_8 | float64 | 3.91 | 14101 |
| std_ic_t2t_mou_8 | float64 | 3.91 | 3517 |
| std_ic_t2m_mou_8 | float64 | 3.91 | 5071 |
| spl_ic_mou_8 | float64 | 3.91 | 79 |
| isd_ic_mou_8 | float64 | 3.91 | 2621 |
| ic_others_8 | float64 | 3.91 | 1020 |
| std_og_t2f_mou_8 | float64 | 3.91 | 1221 |
| std_og_mou_8 | float64 | 3.91 | 10986 |
| std_ic_t2f_mou_8 | float64 | 3.91 | 1579 |
| loc_og_mou_8 | float64 | 3.91 | 13524 |
| loc_og_t2f_mou_8 | float64 | 3.91 | 2526 |
| std_og_t2m_mou_8 | float64 | 3.91 | 9046 |
| onnet_mou_8 | float64 | 3.91 | 12309 |
| loc_og_t2t_mou_8 | float64 | 3.91 | 8179 |
| roam_ic_mou_8 | float64 | 3.91 | 2598 |
| loc_og_t2c_mou_8 | float64 | 3.91 | 1450 |
| loc_og_t2m_mou_8 | float64 | 3.91 | 11978 |
| roam_og_mou_8 | float64 | 3.91 | 3001 |
| std_og_t2t_mou_8 | float64 | 3.91 | 7749 |
| offnet_mou_8 | float64 | 3.91 | 15333 |
| std_ic_t2m_mou_6 | float64 | 1.89 | 5282 |
| loc_ic_mou_6 | float64 | 1.89 | 14454 |
| loc_og_mou_6 | float64 | 1.89 | 14173 |
| std_ic_t2t_mou_6 | float64 | 1.89 | 3609 |
| std_ic_t2f_mou_6 | float64 | 1.89 | 1608 |
| loc_og_t2m_mou_6 | float64 | 1.89 | 12396 |
| std_ic_mou_6 | float64 | 1.89 | 6631 |
| ic_others_6 | float64 | 1.89 | 989 |
| loc_og_t2c_mou_6 | float64 | 1.89 | 1432 |
| loc_og_t2f_mou_6 | float64 | 1.89 | 2609 |
| std_og_t2f_mou_6 | float64 | 1.89 | 1322 |
| spl_ic_mou_6 | float64 | 1.89 | 74 |
| loc_og_t2t_mou_6 | float64 | 1.89 | 8510 |
| loc_ic_t2f_mou_6 | float64 | 1.89 | 3994 |
| isd_ic_mou_6 | float64 | 1.89 | 2555 |
| offnet_mou_6 | float64 | 1.89 | 16091 |
| std_og_mou_6 | float64 | 1.89 | 11930 |
| onnet_mou_6 | float64 | 1.89 | 13084 |
| loc_ic_t2m_mou_6 | float64 | 1.89 | 12503 |
| roam_og_mou_6 | float64 | 1.89 | 3559 |
| loc_ic_t2t_mou_6 | float64 | 1.89 | 7960 |
| isd_og_mou_6 | float64 | 1.89 | 792 |
| roam_ic_mou_6 | float64 | 1.89 | 3134 |
| std_og_t2t_mou_6 | float64 | 1.89 | 8384 |
| og_others_6 | float64 | 1.89 | 755 |
| spl_og_mou_6 | float64 | 1.89 | 2632 |
| std_og_t2m_mou_6 | float64 | 1.89 | 9908 |
| onnet_mou_7 | float64 | 1.87 | 13118 |
| offnet_mou_7 | float64 | 1.87 | 16126 |
| ic_others_7 | float64 | 1.87 | 1107 |
| loc_og_t2m_mou_7 | float64 | 1.87 | 12364 |
| roam_ic_mou_7 | float64 | 1.87 | 2616 |
| roam_og_mou_7 | float64 | 1.87 | 2966 |
| loc_og_t2t_mou_7 | float64 | 1.87 | 8476 |
| isd_ic_mou_7 | float64 | 1.87 | 2686 |
| std_og_t2f_mou_7 | float64 | 1.87 | 1265 |
| loc_og_t2f_mou_7 | float64 | 1.87 | 2622 |
| spl_ic_mou_7 | float64 | 1.87 | 83 |
| std_og_mou_7 | float64 | 1.87 | 11861 |
| isd_og_mou_7 | float64 | 1.87 | 791 |
| std_og_t2m_mou_7 | float64 | 1.87 | 9878 |
| spl_og_mou_7 | float64 | 1.87 | 2843 |
| og_others_7 | float64 | 1.87 | 87 |
| loc_ic_t2t_mou_7 | float64 | 1.87 | 8021 |
| std_og_t2t_mou_7 | float64 | 1.87 | 8466 |
| loc_ic_t2m_mou_7 | float64 | 1.87 | 12503 |
| loc_ic_t2f_mou_7 | float64 | 1.87 | 4039 |
| loc_og_mou_7 | float64 | 1.87 | 14194 |
| loc_ic_mou_7 | float64 | 1.87 | 14495 |
| std_ic_t2t_mou_7 | float64 | 1.87 | 3703 |
| std_ic_mou_7 | float64 | 1.87 | 6703 |
| std_ic_t2f_mou_7 | float64 | 1.87 | 1666 |
| loc_og_t2c_mou_7 | float64 | 1.87 | 1479 |
| std_ic_t2m_mou_7 | float64 | 1.87 | 5391 |
| sachet_2g_8 | int64 | 0.00 | 32 |
| churn_probability | int64 | 0.00 | 2 |
| total_rech_amt_data_7 | float64 | 0.00 | 1448 |
| 3g_vbc_8 | float64 | 0.00 | 6959 |
| 3g_vbc_7 | float64 | 0.00 | 7100 |
| total_rech_amt_data_6 | float64 | 0.00 | 1357 |
| monthly_2g_6 | int64 | 0.00 | 5 |
| monthly_2g_7 | int64 | 0.00 | 6 |
| 3g_vbc_6 | float64 | 0.00 | 6599 |
| sachet_2g_7 | int64 | 0.00 | 33 |
| sachet_3g_8 | int64 | 0.00 | 28 |
| sachet_3g_7 | int64 | 0.00 | 26 |
| sachet_3g_6 | int64 | 0.00 | 23 |
| monthly_3g_8 | int64 | 0.00 | 12 |
| monthly_3g_7 | int64 | 0.00 | 13 |
| monthly_3g_6 | int64 | 0.00 | 10 |
| monthly_2g_8 | int64 | 0.00 | 6 |
| sachet_2g_6 | int64 | 0.00 | 29 |
| aon | int64 | 0.00 | 3139 |
| arpu_6 | float64 | 0.00 | 20453 |
| vol_3g_mb_8 | float64 | 0.00 | 7157 |
| total_rech_amt_6 | int64 | 0.00 | 2093 |
| arpu_8 | float64 | 0.00 | 19775 |
| total_og_mou_6 | float64 | 0.00 | 17265 |
| total_og_mou_7 | float64 | 0.00 | 17392 |
| total_og_mou_8 | float64 | 0.00 | 16445 |
| arpu_7 | float64 | 0.00 | 20461 |
| total_ic_mou_6 | float64 | 0.00 | 15440 |
| total_ic_mou_7 | float64 | 0.00 | 15480 |
| total_ic_mou_8 | float64 | 0.00 | 15089 |
| total_rech_num_6 | int64 | 0.00 | 94 |
| total_rech_num_7 | int64 | 0.00 | 96 |
| total_rech_num_8 | int64 | 0.00 | 90 |
| total_rech_amt_7 | int64 | 0.00 | 2129 |
| vol_3g_mb_7 | float64 | 0.00 | 7631 |
| total_rech_amt_8 | int64 | 0.00 | 2104 |
| max_rech_amt_6 | int64 | 0.00 | 159 |
| max_rech_amt_7 | int64 | 0.00 | 153 |
| max_rech_amt_8 | int64 | 0.00 | 170 |
| last_day_rch_amt_6 | int64 | 0.00 | 150 |
| last_day_rch_amt_7 | int64 | 0.00 | 145 |
| last_day_rch_amt_8 | int64 | 0.00 | 170 |
| vol_2g_mb_6 | float64 | 0.00 | 8406 |
| vol_2g_mb_7 | float64 | 0.00 | 8376 |
| vol_2g_mb_8 | float64 | 0.00 | 7622 |
| vol_3g_mb_6 | float64 | 0.00 | 7231 |
| total_rech_amt_data_8 | float64 | 0.00 | 1343 |
Impute null values with 0 since null value means no recharge has happened.
# A null in max_rech_data_* means no data recharge happened - impute with 0.
for month in ('6', '7', '8'):
    col = 'max_rech_data_' + month
    telecom_high_val_cust[col] = telecom_high_val_cust[col].replace(np.nan, 0.0)
All attributes that have 40% or more null values will be dropped.
# Share of missing values per column, in percent
missing_percentages = telecom_high_val_cust.isnull().sum() / len(telecom_high_val_cust) * 100
# Columns where 40% or more of the values are missing carry too little signal
columns_with_high_missing_values = missing_percentages[missing_percentages >= 40].index
# Drop those columns
telecom_high_val_cust = telecom_high_val_cust.drop(columns_with_high_missing_values,axis=1)
print("Dataframe shape after dropping columns with high missing values:",str(telecom_high_val_cust.shape))
Dataframe shape after dropping columns with high missing values: (20998, 131)
Since mou columns have less than 5% missing data, we will impute them with 0 to indicate that this service was not used by the customer.
# Minutes-of-usage (mou) columns have <5% missing data; a null means the
# service was never used, so impute with 0.
col_mou = [col for col in telecom_high_val_cust.columns if 'mou' in col]
telecom_high_val_cust[col_mou] = telecom_high_val_cust[col_mou].fillna(0.0)
# The loc/std/total aggregate mou columns are sums of their component columns
# and therefore strongly correlated with them - drop the aggregates.
col_og_ic_mou = [prefix + month
                 for prefix in ('loc_og_mou_', 'std_og_mou_', 'total_og_mou_',
                                'loc_ic_mou_', 'std_ic_mou_', 'total_ic_mou_')
                 for month in ('6', '7', '8')]
telecom_high_val_cust = telecom_high_val_cust.drop(col_og_ic_mou, axis=1)
print("Dataframe shape after dropping repetitive(correlated) columns:",str(telecom_high_val_cust.shape))
Dataframe shape after dropping repetitive(correlated) columns: (20998, 113)
Impute the other ic/og columns with 0, since they have less than 5% missing data.
# Remaining incoming ('ic') / outgoing ('og') call columns also have <5%
# missing data - impute with 0.
col_og_ic = [col for col in telecom_high_val_cust.columns if 'ic' in col or 'og' in col]
telecom_high_val_cust[col_og_ic] = telecom_high_val_cust[col_og_ic].fillna(0.0)
# Collapse each pair of month-6/month-7 columns into a single "good phase"
# average column (avg_<name>_6_7), then drop the originals.
telecom_df = telecom_high_val_cust.copy()
col_6_7 = [col for col in telecom_df.columns if '_6' in col or '_7' in col]
# Strip the 2-char month suffix and deduplicate to get the base feature names
# (reuses col_6_7 instead of re-scanning the columns, and drops the unused
# enumerate index the original carried).
col_6_7_common = list(set(col[:-2] for col in col_6_7))
for col in col_6_7_common:
    telecom_df["avg_" + col + "_6_7"] = (telecom_df[col + "_6"] + telecom_df[col + "_7"]) / 2
telecom_df = telecom_df.drop(col_6_7, axis = 1)
print("Dataframe shape after dropping transformed columns:",str(telecom_df.shape))
Dataframe shape after dropping transformed columns: (20998, 76)
# Correlation heatmap of the engineered month-6/7 average features
col_6_7 = [col for col in telecom_df.columns if '_6_7' in col]
plt.figure(figsize=(20, 15))
sns.heatmap(telecom_df[col_6_7].corr(), annot=True)
plt.show()
We do not see any strong correlations that need to be addressed. One thing we notice is as recharge amount increases, average revenue per user also increases.
# All remaining columns except the target form the model feature set
final_column_list = [col for col in telecom_df.columns if col != 'churn_probability']
print("Number of columns that will be used for model building:", len(final_column_list))
Number of columns that will be used for model building: 75
# Determine skewness for all numeric variables; the strong positive skew seen
# here motivates the outlier treatment performed below
telecom_df.skew()
arpu_8 18.145858 onnet_mou_8 5.372370 offnet_mou_8 4.758479 roam_ic_mou_8 20.457859 roam_og_mou_8 16.380833 loc_og_t2t_mou_8 15.260752 loc_og_t2m_mou_8 4.097068 loc_og_t2f_mou_8 10.643422 loc_og_t2c_mou_8 16.890732 std_og_t2t_mou_8 5.537740 std_og_t2m_mou_8 6.833574 std_og_t2f_mou_8 15.496891 isd_og_mou_8 81.027659 spl_og_mou_8 15.237698 og_others_8 107.578471 loc_ic_t2t_mou_8 11.065714 loc_ic_t2m_mou_8 4.636332 loc_ic_t2f_mou_8 11.269502 std_ic_t2t_mou_8 20.611378 std_ic_t2m_mou_8 12.405555 std_ic_t2f_mou_8 36.590324 spl_ic_mou_8 4.853747 isd_ic_mou_8 17.759014 ic_others_8 46.307587 total_rech_num_8 2.800001 total_rech_amt_8 21.419988 max_rech_amt_8 5.608865 last_day_rch_amt_8 7.054781 max_rech_data_8 2.897118 vol_2g_mb_8 8.138860 vol_3g_mb_8 9.172511 monthly_2g_8 2.890655 sachet_2g_8 3.951159 monthly_3g_8 5.010947 sachet_3g_8 13.462701 aon 0.986145 3g_vbc_8 5.916601 churn_probability 3.032157 total_rech_amt_data_8 18.335753 avg_og_others_6_7 89.368833 avg_std_ic_t2m_mou_6_7 11.667077 avg_std_og_t2t_mou_6_7 4.412864 avg_total_rech_amt_data_6_7 9.434197 avg_std_ic_t2t_mou_6_7 24.908280 avg_3g_vbc_6_7 4.800676 avg_spl_og_mou_6_7 20.505711 avg_ic_others_6_7 52.905578 avg_loc_ic_t2f_mou_6_7 9.501778 avg_total_rech_amt_6_7 21.257848 avg_std_ic_t2f_mou_6_7 33.144514 avg_monthly_2g_6_7 2.327500 avg_isd_ic_mou_6_7 31.879174 avg_loc_og_t2f_mou_6_7 10.783785 avg_roam_og_mou_6_7 9.009688 avg_spl_ic_mou_6_7 66.553173 avg_loc_og_t2c_mou_6_7 23.811981 avg_std_og_t2m_mou_6_7 4.237469 avg_isd_og_mou_6_7 81.977557 avg_sachet_2g_6_7 3.545139 avg_loc_og_t2m_mou_6_7 3.676190 avg_arpu_6_7 20.881621 avg_last_day_rch_amt_6_7 5.431195 avg_total_rech_num_6_7 2.718843 avg_onnet_mou_6_7 3.898005 avg_vol_3g_mb_6_7 9.355957 avg_sachet_3g_6_7 11.164401 avg_std_og_t2f_mou_6_7 17.067401 avg_loc_ic_t2m_mou_6_7 4.356972 avg_roam_ic_mou_6_7 15.069792 avg_max_rech_amt_6_7 4.340042 avg_vol_2g_mb_6_7 5.958775 avg_max_rech_data_6_7 2.871534 avg_offnet_mou_6_7 3.156728 avg_loc_og_t2t_mou_6_7 9.560443 
avg_monthly_3g_6_7 4.334295 avg_loc_ic_t2t_mou_6_7 10.305137 dtype: float64
# Visualize the percentage change between consecutive quantiles (90th..100th
# in 1% steps) for the numeric columns - a large jump near the top of the
# range signals outliers in the right tail.
telecom_df.quantile(np.arange(0.9,1.01,0.01), axis=0, numeric_only=True).pct_change().mul(100).style.bar()
# Identify columns with outliers: a column is flagged when the percentage
# change from the 99th to the 100th percentile exceeds 100.
pct_change_99_1 = telecom_df.quantile(np.arange(0.9,1.01,0.01),axis=0, numeric_only=True).pct_change().mul(100).iloc[-1]
columns_with_outliers = pct_change_99_1[pct_change_99_1 > 100].index.values
print('Number of columns with outliers :', len(columns_with_outliers))
Number of columns with outliers : 74
# Treat the outliers: values above the 99th percentile of each flagged column
# are capped at that percentile. A per-column report (threshold used, number
# of values replaced) is accumulated as plain dicts and turned into a
# DataFrame once at the end - pd.concat inside the loop is quadratic and the
# concat-with-empty-frame pattern it used is deprecated in pandas.
treatment_rows = []
for col in columns_with_outliers:
    outlier_threshold = telecom_df[col].quantile(0.99)
    condition = telecom_df[col] > outlier_threshold
    treatment_rows.append({'Column': col,
                           'Outlier Threshold': outlier_threshold,
                           'Outliers replaced': int(condition.sum())})
    # Cap the offending values in place
    telecom_df.loc[condition, col] = outlier_threshold
outlier_treatment = pd.DataFrame(treatment_rows, columns=['Column', 'Outlier Threshold', 'Outliers replaced'])
outlier_treatment
| Column | Outlier Threshold | Outliers replaced | |
|---|---|---|---|
| 0 | arpu_8 | 1971.592470 | 210 |
| 0 | onnet_mou_8 | 2153.499100 | 210 |
| 0 | offnet_mou_8 | 2157.510400 | 210 |
| 0 | roam_ic_mou_8 | 274.131100 | 210 |
| 0 | roam_og_mou_8 | 395.072000 | 210 |
| 0 | loc_og_t2t_mou_8 | 899.539400 | 210 |
| 0 | loc_og_t2m_mou_8 | 1065.090700 | 210 |
| 0 | loc_og_t2f_mou_8 | 82.210900 | 210 |
| 0 | loc_og_t2c_mou_8 | 27.212100 | 210 |
| 0 | std_og_t2t_mou_8 | 1889.473300 | 210 |
| 0 | std_og_t2m_mou_8 | 1873.920100 | 210 |
| 0 | std_og_t2f_mou_8 | 34.980600 | 210 |
| 0 | isd_og_mou_8 | 28.797200 | 210 |
| 0 | spl_og_mou_8 | 71.797500 | 210 |
| 0 | og_others_8 | 0.000000 | 126 |
| 0 | loc_ic_t2t_mou_8 | 619.390100 | 210 |
| 0 | loc_ic_t2m_mou_8 | 955.963900 | 210 |
| 0 | loc_ic_t2f_mou_8 | 179.601900 | 210 |
| 0 | std_ic_t2t_mou_8 | 211.139000 | 210 |
| 0 | std_ic_t2m_mou_8 | 357.242300 | 210 |
| 0 | std_ic_t2f_mou_8 | 45.050200 | 210 |
| 0 | spl_ic_mou_8 | 0.560000 | 206 |
| 0 | isd_ic_mou_8 | 227.415100 | 210 |
| 0 | ic_others_8 | 21.350900 | 210 |
| 0 | total_rech_num_8 | 44.000000 | 207 |
| 0 | total_rech_amt_8 | 2328.120000 | 210 |
| 0 | max_rech_amt_8 | 951.000000 | 199 |
| 0 | last_day_rch_amt_8 | 565.000000 | 207 |
| 0 | max_rech_data_8 | 455.000000 | 120 |
| 0 | vol_2g_mb_8 | 1449.599000 | 210 |
| 0 | vol_3g_mb_8 | 3917.502400 | 210 |
| 0 | monthly_2g_8 | 2.000000 | 27 |
| 0 | sachet_2g_8 | 12.000000 | 163 |
| 0 | monthly_3g_8 | 3.000000 | 102 |
| 0 | sachet_3g_8 | 5.000000 | 149 |
| 0 | 3g_vbc_8 | 1965.210600 | 210 |
| 0 | total_rech_amt_data_8 | 5688.720000 | 210 |
| 0 | avg_og_others_6_7 | 4.595000 | 209 |
| 0 | avg_std_ic_t2m_mou_6_7 | 373.329800 | 210 |
| 0 | avg_std_og_t2t_mou_6_7 | 1788.015650 | 210 |
| 0 | avg_total_rech_amt_data_6_7 | 5134.680000 | 210 |
| 0 | avg_std_ic_t2t_mou_6_7 | 196.187250 | 210 |
| 0 | avg_3g_vbc_6_7 | 1817.028400 | 210 |
| 0 | avg_spl_og_mou_6_7 | 63.476800 | 210 |
| 0 | avg_ic_others_6_7 | 18.902100 | 210 |
| 0 | avg_loc_ic_t2f_mou_6_7 | 190.901200 | 210 |
| 0 | avg_total_rech_amt_6_7 | 2202.090000 | 210 |
| 0 | avg_std_ic_t2f_mou_6_7 | 46.967250 | 210 |
| 0 | avg_monthly_2g_6_7 | 1.500000 | 80 |
| 0 | avg_isd_ic_mou_6_7 | 222.025650 | 210 |
| 0 | avg_loc_og_t2f_mou_6_7 | 84.923350 | 210 |
| 0 | avg_roam_og_mou_6_7 | 448.911150 | 210 |
| 0 | avg_spl_ic_mou_6_7 | 0.410000 | 209 |
| 0 | avg_loc_og_t2c_mou_6_7 | 21.565150 | 210 |
| 0 | avg_std_og_t2m_mou_6_7 | 1859.086850 | 210 |
| 0 | avg_isd_og_mou_6_7 | 39.886150 | 210 |
| 0 | avg_sachet_2g_6_7 | 11.000000 | 208 |
| 0 | avg_loc_og_t2m_mou_6_7 | 1071.498200 | 210 |
| 0 | avg_arpu_6_7 | 1841.591235 | 210 |
| 0 | avg_last_day_rch_amt_6_7 | 549.500000 | 208 |
| 0 | avg_total_rech_num_6_7 | 45.000000 | 206 |
| 0 | avg_onnet_mou_6_7 | 2059.418800 | 210 |
| 0 | avg_vol_3g_mb_6_7 | 3482.782950 | 210 |
| 0 | avg_sachet_3g_6_7 | 4.000000 | 192 |
| 0 | avg_std_og_t2f_mou_6_7 | 36.938150 | 210 |
| 0 | avg_loc_ic_t2m_mou_6_7 | 929.915900 | 210 |
| 0 | avg_roam_ic_mou_6_7 | 272.755800 | 210 |
| 0 | avg_max_rech_amt_6_7 | 790.000000 | 209 |
| 0 | avg_vol_2g_mb_6_7 | 1364.016450 | 210 |
| 0 | avg_max_rech_data_6_7 | 449.000000 | 196 |
| 0 | avg_offnet_mou_6_7 | 2157.215050 | 210 |
| 0 | avg_loc_og_t2t_mou_6_7 | 975.177750 | 210 |
| 0 | avg_monthly_3g_6_7 | 2.500000 | 164 |
| 0 | avg_loc_ic_t2t_mou_6_7 | 631.440950 | 210 |
# Work on a copy for EDA so the modelling dataframe stays untouched
telecom_eda = telecom_df.copy()
# Split by target for side-by-side churn vs non-churn comparisons
churn = telecom_eda.loc[telecom_eda['churn_probability'] == 1]
not_churn = telecom_eda.loc[telecom_eda['churn_probability'] == 0]
print("Churn dataset:",churn.shape)
print("Non-churn dataset:",not_churn.shape)
Churn dataset: (1735, 76) Non-churn dataset: (19263, 76)
# Map the binary target to Yes/No labels for more readable charts
telecom_eda['churn_labels'] = telecom_eda['churn_probability'].map({0: 'No', 1: 'Yes'})
# Donut chart of the overall churn distribution
label_counts = telecom_eda['churn_labels'].value_counts()
lab = label_counts.keys().tolist()
val = label_counts.values.tolist()
trace = go.Pie(
    labels=lab,
    values=val,
    marker=dict(colors=['royalblue', 'lime'],
                line=dict(color="white", width=1.3)),
    rotation=90,
    hoverinfo="label+value+text",
    hole=.5,
)
layout = go.Layout(dict(title="Customer Churn in Data",
                        plot_bgcolor="rgb(243,243,243)",
                        paper_bgcolor="rgb(243,243,243)"))
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)
We see that 8.26% of the high-value customers currently churn, i.e. move to other networks. Our goal in this case study is to identify such customers ahead of time so that retention measures can be targeted at them.
def histogram(column):
    """Overlay percent-normalized histograms of *column* for the non-churn and
    churn groups (reads the module-level ``churn`` / ``not_churn`` frames)."""
    def _trace(frame, trace_name):
        # One percent-normalized histogram trace with a thin black outline
        return go.Histogram(x=frame[column],
                            histnorm="percent",
                            name=trace_name,
                            marker=dict(line=dict(width=0.5, color="black")),
                            opacity=.9)

    def _axis():
        # Shared axis styling; both axes carry the column name as title,
        # matching the original figure
        return dict(gridcolor='rgb(255, 255, 255)',
                    title=column,
                    zerolinewidth=1,
                    ticklen=5,
                    gridwidth=2)

    data = [_trace(not_churn, "Non-churn Customers"),
            _trace(churn, "Churn Customers")]
    layout = go.Layout(dict(title=column + " - Distribution in Customer Churn",
                            plot_bgcolor="rgb(243,243,243)",
                            paper_bgcolor="rgb(243,243,243)",
                            xaxis=_axis(),
                            yaxis=_axis()))
    fig = go.Figure(data=data, layout=layout)
    pyo.iplot(fig)
# Plot the churn vs non-churn distribution for every model feature
for i in final_column_list:
    histogram(i)
# Create new column tenure (months on network) from age on network (aon),
# which is measured in days, for better analysis
telecom_eda['tenure'] = telecom_eda['aon']/30
# Bin tenure into ranges. The upper edge is unbounded so that customers with
# more than 61 months on the network still land in '5 Yrs and above' - the
# original upper edge of 61 silently mapped them to NaN even though aon goes
# up to ~144 months in this data.
tenure_range = [0, 6, 12, 24, 60, np.inf]
tenure_label = [ '0-6 Months', '6-12 Months', '1-2 Yrs', '2-5 Yrs', '5 Yrs and above']
telecom_eda['tenure_range'] = pd.cut(telecom_eda['tenure'], tenure_range, labels=tenure_label)
telecom_eda['tenure_range'].head()
4 1-2 Yrs 15 1-2 Yrs 23 1-2 Yrs 24 2-5 Yrs 27 1-2 Yrs Name: tenure_range, dtype: category Categories (5, object): ['0-6 Months' < '6-12 Months' < '1-2 Yrs' < '2-5 Yrs' < '5 Yrs and above']
# Re-split churn/non-churn now that the tenure columns exist on telecom_eda
churn = telecom_eda[telecom_eda['churn_probability']==1]
not_churn = telecom_eda[telecom_eda['churn_probability']==0]
# Percentage of churning / non-churning customers within each tenure range.
# value_counts on the categorical column yields all five buckets in order,
# so the churn and total counts align positionally.
tenure_total = telecom_eda['tenure_range'].value_counts().sort_index()
tenure_churn = churn['tenure_range'].value_counts().sort_index()
tenure_churn_per = (tenure_churn.values / tenure_total.values)*100
tenure_nchurn = not_churn['tenure_range'].value_counts().sort_index()
tenure_nchurn_per = (tenure_nchurn.values / tenure_total.values)*100
# Grouped bar chart: churn (red) vs non-churn (blue) per tenure bucket
fig = go.Figure()
fig.add_trace(go.Bar(x=tenure_total.index,
                     y=tenure_churn_per,
                     name='Customer Churning',
                     offset=-0.2,
                     marker=dict(color='Red')))
fig.add_trace(go.Bar(x=tenure_total.index,
                     y=tenure_nchurn_per,
                     name='Customer Not Churning',
                     offset=+0.2,
                     marker=dict(color='Blue')))
fig.update_traces(width=0.4)
fig.update_layout(title='Customer Churning Status Per Tenure Range',
                  xaxis_title='Tenure Range',
                  yaxis_title='Percentage of Customers')
fig.show()
def plot_tenure_scatter(tenure_range, color):
    """Return a scatter trace of August total vs max recharge amount for
    customers in the given tenure range, drawn in the given color."""
    subset = telecom_eda[telecom_eda["tenure_range"] == tenure_range]
    return go.Scatter(x=subset["total_rech_amt_8"],
                      y=subset["max_rech_amt_8"],
                      mode="markers",
                      marker=dict(line=dict(color="black", width=.2),
                                  size=4,
                                  color=color,
                                  symbol="diamond-dot"),
                      name=tenure_range,
                      opacity=.9)
def plot_churn_scatter(churn, color):
    """Return a scatter trace of August total vs max recharge amount for
    customers in the given churn-label group ('Yes'/'No')."""
    subset = telecom_eda[telecom_eda["churn_labels"] == churn]
    return go.Scatter(x=subset["total_rech_amt_8"],
                      y=subset["max_rech_amt_8"],
                      mode="markers",
                      marker=dict(line=dict(color="black", width=.2),
                                  size=4,
                                  color=color,
                                  symbol="diamond-dot"),
                      name=churn,
                      opacity=.9)
# Build one scatter trace per tenure bucket and one per churn group; data1
# feeds the tenure figure, data2 the churn figure
trace1 = plot_tenure_scatter("0-6 Months","#FF3366")
trace2 = plot_tenure_scatter("6-12 Months","#6666FF")
trace3 = plot_tenure_scatter("1-2 Yrs","#00FF00")
trace4 = plot_tenure_scatter("2-5 Yrs","#996600")
trace5 = plot_tenure_scatter("5 Yrs and above","grey")
trace6 = plot_churn_scatter("Yes","red")
trace7 = plot_churn_scatter("No","blue")
data1 = [trace1, trace3, trace4, trace5, trace2]
data2 = [trace7, trace6]
def layout_title(title):
    """Return a plotly Layout in the shared grey theme, titled *title*, with
    'Total charges' on the x-axis and 'Maximum charges' on the y-axis."""
    def _axis(axis_title):
        # Common axis styling for both axes
        return dict(gridcolor='rgb(255,255,255)',
                    title=axis_title,
                    zerolinewidth=1, ticklen=5, gridwidth=2)

    return go.Layout(dict(title=title,
                          plot_bgcolor="rgb(243,243,243)",
                          paper_bgcolor="rgb(243,243,243)",
                          xaxis=_axis("Total charges"),
                          yaxis=_axis("Maximum charges"),
                          height=600))
# Render the two scatter figures: recharge behaviour by tenure range and by
# churn label for August (the action phase)
layout1 = layout_title("Monthly Charges & Max Charges by Tenure Range for August")
layout2 = layout_title("Monthly Charges & Max Charges by Churn Probability for August ")
fig1 = go.Figure(data = data1, layout = layout1)
fig2 = go.Figure(data = data2, layout = layout2)
pyo.iplot(fig1)
pyo.iplot(fig2)
def scatter_month(x_data, y_data):
    """Scatter plot of *x_data* vs *y_data* from telecom_eda, colored by
    churn label."""
    figure = px.scatter(telecom_eda, x=x_data, y=y_data, color='churn_labels')
    pyo.iplot(figure)
# Compare good-phase (months 6-7 average) behaviour against action-phase
# (month 8) behaviour for key recharge, revenue and roaming features
scatter_month('avg_total_rech_amt_6_7', 'total_rech_amt_8')
scatter_month('avg_max_rech_data_6_7', 'max_rech_data_8')
scatter_month('avg_last_day_rch_amt_6_7', 'last_day_rch_amt_8')
scatter_month('avg_arpu_6_7','arpu_8')
scatter_month('avg_roam_og_mou_6_7','roam_og_mou_8')
scatter_month('avg_roam_ic_mou_6_7','roam_ic_mou_8')
# Independent variables/Features in X
X = telecom_df.drop('churn_probability', axis = 1)
# Target variable in y
y = telecom_df['churn_probability']
# Splitting data into train and test set 80:20; fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)
We know there is data imbalance in the dataset as seen in the pie chart in EDA. 91.7% customers do not churn, only 8.26% customers churn. We will deal with the imbalance using SMOTE (Synthetic Minority Oversampling Technique)
# SMOTE: oversample the minority (churn) class with synthetic samples on the
# training set only, so the test set keeps the original class distribution
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
As seen in the metadata matrix of the dataset, the values of the attributes lie in very different ranges. Therefore, we will scale them using the standard scaler technique.
# Instantiate the Scaler
scaler = StandardScaler()
# Fit on the (resampled) training data only, then transform it - fitting on
# train alone avoids leaking test-set statistics into the model
X_train[final_column_list] = scaler.fit_transform(X_train[final_column_list])
# Transform the test set with the train-fitted scaler
X_test[final_column_list] = scaler.transform(X_test[final_column_list])
# Instantiate PCA (all components kept for the scree analysis)
pca = PCA(random_state=42)
# Fit train set on PCA
pca.fit(X_train)
# Principal components
pca.components_
# Cumulative variance explained by the principal components
variance_cumu = np.cumsum(pca.explained_variance_ratio_)
# Plotting scree plot; the reference lines mark ~46 components (vertical)
# and the 95% explained-variance level (horizontal)
fig = plt.figure(figsize=[12,8])
plt.vlines(x=46, ymax=1, ymin=0, colors="r", linestyles="--")
plt.hlines(y=0.95, xmax=80, xmin=0, colors="g", linestyles="--")
plt.plot(variance_cumu)
plt.xlabel('Number of Components')
plt.ylabel("Cumulative variance explained")
plt.show()
About 45 components explain roughly 95% of the variance in the dataset. Therefore, we will build PCA with 45 components.
# Reduce to 45 components (~95% of variance per the scree plot).
# NOTE(review): IncrementalPCA is used here although the comment above said
# PCA - it fits in mini-batches and approximates full PCA; confirm this is
# intentional.
pca_final = IncrementalPCA(n_components=45)
# Fit and transform the X_train
X_train_pca = pca_final.fit_transform(X_train)
# Transform the test set with the train-fitted decomposition
X_test_pca = pca_final.transform(X_test)
def model_metrics(y_data, y_data_pred):
    """Print a grid-formatted table of classification metrics.

    Parameters: y_data (true labels) and y_data_pred (predicted labels),
    both binary. Prints the confusion matrix plus accuracy, sensitivity
    (recall), specificity, precision and F1-score, all rounded to 3 decimals.

    Robustness fix: ratios whose denominator is zero (e.g. no positive
    samples, or no positive predictions) are reported as 0.0 instead of
    raising ZeroDivisionError as the original did.
    """
    def _ratio(num, den):
        # Guard against empty classes / zero denominators
        return round(num / den, 3) if den else 0.0

    confusion = metrics.confusion_matrix(y_data, y_data_pred)
    TP = confusion[1,1] # true positives
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    # Accuracy
    accuracy = round(metrics.accuracy_score(y_data, y_data_pred), 3)
    # Sensitivity / recall: share of actual positives found
    sensitivity = _ratio(TP, float(FN + TP))
    # Specificity: share of actual negatives found
    specificity = _ratio(TN, float(TN + FP))
    # Precision: share of positive predictions that are correct
    precision = _ratio(TP, float(TP + FP))
    # F1: harmonic mean of (rounded) precision and sensitivity, as originally
    f1_score = _ratio(2 * precision * sensitivity, precision + sensitivity)
    table_data = [
        ["Confusion matrix", confusion],
        ["Accuracy", accuracy],
        ["Sensitivity/Recall", sensitivity],
        ["Specificity", specificity],
        ["Precision", precision],
        ["F1-score", f1_score],
    ]
    # Print the formatted table
    print(tabulate(table_data, headers=["Metric", "Value"], tablefmt="grid"))
# Stratified 3-fold CV keeps the churn ratio consistent across folds
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=4)

# Hyperparameter grid: only the inverse regularization strength C
params = {"C": [0.01, 0.1, 1, 10, 100, 1000]}

# Score with ROC-AUC, which is robust to the class imbalance
log1_model_cv = GridSearchCV(estimator=LogisticRegression(),
                             param_grid=params,
                             scoring='roc_auc',
                             cv=folds,
                             verbose=1,
                             return_train_score=True)

# Run the search on the PCA-transformed training data
log1_model_cv.fit(X_train_pca, y_train)
Fitting 3 folds for each of 6 candidates, totalling 18 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=4, shuffle=True),
estimator=LogisticRegression(),
param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
return_train_score=True, scoring='roc_auc', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=4, shuffle=True),
estimator=LogisticRegression(),
param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
return_train_score=True, scoring='roc_auc', verbose=1)LogisticRegression()
LogisticRegression()
# Collect the grid-search CV results as a DataFrame for inspection
cv_results = pd.DataFrame(log1_model_cv.cv_results_)

# Report the best ROC-AUC and the hyperparameters that achieved it
print("Best AUC: ", log1_model_cv.best_score_)
print("Best hyperparameters: ", log1_model_cv.best_params_)
Best AUC: 0.914145455747912
Best hyperparameters: {'C': 100}
# Logistic regression refit with the best C found by the grid search
logistic_pca = LogisticRegression(C=100)
log1_pca_model = logistic_pca.fit(X_train_pca, y_train)

# Metrics on the (SMOTE-balanced) training set
y_train_pred = log1_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)

# Metrics on the held-out test set
y_test_pred = log1_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[12661 2739] | | | [ 2187 13213]] | +--------------------+-----------------+ | Accuracy | 0.84 | +--------------------+-----------------+ | Sensitivity/Recall | 0.858 | +--------------------+-----------------+ | Specificity | 0.822 | +--------------------+-----------------+ | Precision | 0.828 | +--------------------+-----------------+ | F1-score | 0.843 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3171 692] | | | [ 63 274]] | +--------------------+---------------+ | Accuracy | 0.82 | +--------------------+---------------+ | Sensitivity/Recall | 0.813 | +--------------------+---------------+ | Specificity | 0.821 | +--------------------+---------------+ | Precision | 0.284 | +--------------------+---------------+ | F1-score | 0.421 | +--------------------+---------------+
# SVM Linear Vs RBF - to determine which kernel to be used
# SVC is not imported in the notebook header, so import it here to
# avoid a NameError when this cell runs.
from sklearn.svm import SVC

# Baseline linear-kernel SVM, evaluated by test-set accuracy
model = SVC(kernel="linear")
model.fit(X_train_pca, y_train)
y_test_pred = model.predict(X_test_pca)
accuracy = round(metrics.accuracy_score(y_test, y_test_pred), 3)
print("Accuracy of linear model is:", accuracy)
Accuracy of linear model is: 0.832
# Baseline RBF-kernel SVM for comparison against the linear kernel
model = SVC(kernel="rbf")
model.fit(X_train_pca, y_train)
# Accuracy on the held-out test set
y_test_pred = model.predict(X_test_pca)
accuracy = round(metrics.accuracy_score(y_test, y_test_pred), 3)
print("Accuracy of rbf model is:", accuracy)
Accuracy of rbf model is: 0.885
We see that SVM with Radial Basis Function gives better accuracy. Therefore, let's tune parameters for rbf model.
# Stratified 3-fold CV for the SVM hyperparameter search
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
# Candidate values for the RBF kernel width (gamma) and the penalty (C)
hyper_params = [{'gamma': [1e-1, 1e-2], 'C': [1, 10]}]

# RBF-kernel SVM as the base estimator
model = SVC(kernel="rbf")

# Grid search scored on accuracy; refit=True retrains the best combo
svm1_model_cv = GridSearchCV(estimator=model,
                             param_grid=hyper_params,
                             scoring='accuracy',
                             cv=folds,
                             verbose=1,
                             refit=True,
                             return_train_score=True,
                             n_jobs=-1)
svm1_model_cv.fit(X_train_pca, y_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'gamma': [0.1, 0.01]}],
return_train_score=True, scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'gamma': [0.1, 0.01]}],
return_train_score=True, scoring='accuracy', verbose=1)SVC()
SVC()
# Cross-validation results for every hyperparameter combination
cv_results = pd.DataFrame(svm1_model_cv.cv_results_)
cv_results
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_gamma | params | split0_test_score | split1_test_score | split2_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176.286576 | 12.767550 | 25.489950 | 1.543002 | 1 | 0.1 | {'C': 1, 'gamma': 0.1} | 0.959482 | 0.960748 | 0.955776 | 0.958669 | 0.002110 | 2 | 0.982078 | 0.982321 | 0.983150 | 0.982516 | 0.000459 |
| 1 | 35.476198 | 0.300432 | 25.920587 | 0.431521 | 1 | 0.01 | {'C': 1, 'gamma': 0.01} | 0.907178 | 0.910393 | 0.905903 | 0.907825 | 0.001889 | 4 | 0.917791 | 0.916086 | 0.916967 | 0.916948 | 0.000696 |
| 2 | 168.488312 | 22.244605 | 24.981936 | 1.646983 | 10 | 0.1 | {'C': 10, 'gamma': 0.1} | 0.972144 | 0.971267 | 0.967076 | 0.970162 | 0.002212 | 1 | 0.997273 | 0.996786 | 0.997175 | 0.997078 | 0.000210 |
| 3 | 35.726973 | 0.368254 | 19.378474 | 0.277905 | 10 | 0.01 | {'C': 10, 'gamma': 0.01} | 0.937957 | 0.937275 | 0.936295 | 0.937175 | 0.000682 | 3 | 0.952905 | 0.953197 | 0.952713 | 0.952938 | 0.000199 |
# print best hyperparameters
# The grid was scored with 'accuracy' (not AUC), so label the best score
# accordingly — the original "Best AUC" label was misleading.
print("Best accuracy: ", svm1_model_cv.best_score_)
print("Best hyperparameters: ", svm1_model_cv.best_params_)
Best AUC: 0.9701622374528146
Best hyperparameters: {'C': 10, 'gamma': 0.1}
# Refit an RBF SVM with the best hyperparameters from the grid search
svm1_pca_model = SVC(C=10, gamma=0.1, kernel="rbf")
svm1_pca_model.fit(X_train_pca, y_train)

# Metrics on the training set
y_train_pred = svm1_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)

# Metrics on the held-out test set
y_test_pred = svm1_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15310 90] | | | [ 2 15398]] | +--------------------+-----------------+ | Accuracy | 0.997 | +--------------------+-----------------+ | Sensitivity/Recall | 1.0 | +--------------------+-----------------+ | Specificity | 0.994 | +--------------------+-----------------+ | Precision | 0.994 | +--------------------+-----------------+ | F1-score | 0.997 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3742 121] | | | [ 210 127]] | +--------------------+---------------+ | Accuracy | 0.921 | +--------------------+---------------+ | Sensitivity/Recall | 0.377 | +--------------------+---------------+ | Specificity | 0.969 | +--------------------+---------------+ | Precision | 0.512 | +--------------------+---------------+ | F1-score | 0.434 | +--------------------+---------------+
# Stratified 3-fold CV; this search additionally weights classes inversely
# to their frequency via class_weight='balanced'
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
hyper_params = [{'gamma': [1e-1, 1e-2],
                 'C': [1, 10],
                 'class_weight': ['balanced']}]

# RBF-kernel SVM as the base estimator
model = SVC(kernel="rbf")

# Grid search scored on accuracy, refitting the best combination
svm2_model_cv = GridSearchCV(estimator=model,
                             param_grid=hyper_params,
                             scoring='accuracy',
                             cv=folds,
                             verbose=1,
                             refit=True,
                             return_train_score=True,
                             n_jobs=-1)
svm2_model_cv.fit(X_train_pca, y_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'class_weight': ['balanced'],
'gamma': [0.1, 0.01]}],
return_train_score=True, scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'class_weight': ['balanced'],
'gamma': [0.1, 0.01]}],
return_train_score=True, scoring='accuracy', verbose=1)SVC()
SVC()
# print best hyperparameters
# Scoring was 'accuracy', so label the best score accordingly
# (the original "Best AUC" label was misleading).
print("Best accuracy: ", svm2_model_cv.best_score_)
print("Best hyperparameters: ", svm2_model_cv.best_params_)
Best AUC: 0.9701622374528146
Best hyperparameters: {'C': 10, 'class_weight': 'balanced', 'gamma': 0.1}
# Building the model with the optimal hyperparameters.
# Note: the original also passed degree=2, but 'degree' only applies to the
# 'poly' kernel and is silently ignored for 'rbf', so it is dropped here.
svm2_pca_model = SVC(C=10, class_weight="balanced", gamma=0.1, kernel="rbf")
svm2_pca_model.fit(X_train_pca, y_train)
# Predictions on the train set
y_train_pred = svm2_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = svm2_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15310 90] | | | [ 2 15398]] | +--------------------+-----------------+ | Accuracy | 0.997 | +--------------------+-----------------+ | Sensitivity/Recall | 1.0 | +--------------------+-----------------+ | Specificity | 0.994 | +--------------------+-----------------+ | Precision | 0.994 | +--------------------+-----------------+ | F1-score | 0.997 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3742 121] | | | [ 210 127]] | +--------------------+---------------+ | Accuracy | 0.921 | +--------------------+---------------+ | Sensitivity/Recall | 0.377 | +--------------------+---------------+ | Specificity | 0.969 | +--------------------+---------------+ | Precision | 0.512 | +--------------------+---------------+ | F1-score | 0.434 | +--------------------+---------------+
# Basic RFC to check accuracy
# RandomForestClassifier is not imported in the notebook header, so import
# it here to avoid a NameError when this cell runs.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train_pca, y_train)
y_test_pred = rfc.predict(X_test_pca)
accuracy = round(metrics.accuracy_score(y_test, y_test_pred), 3)
print("Accuracy of basic rfc model is:", accuracy)
Accuracy of basic rfc model is: 0.913
We can see that Random Forest Classifier performs very well and therefore, we will go ahead and tune its parameters.
# define a function for tuning parameters for Random Forest individually
def hpt(params, x_train, y_train, scoring, max_depth=0):
    """Grid-search a single Random Forest hyperparameter and plot the scores.

    Parameters
    ----------
    params : dict
        A grid with exactly one entry, {hyperparameter_name: candidate_values};
        only the first key is used for the plot's x-axis.
    x_train, y_train : array-like
        Training features and labels.
    scoring : str
        sklearn scoring name (e.g. "accuracy", "roc_auc").
    max_depth : int, optional
        When non-zero, fixes the forest's max_depth while another parameter
        is tuned; 0 acts as the "use sklearn's default" sentinel.
    """
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
    # Fix max_depth only when the caller asked for it
    if max_depth == 0:
        rf = RandomForestClassifier()
    else:
        rf = RandomForestClassifier(max_depth=max_depth)
    grid_search = GridSearchCV(estimator=rf,
                               param_grid=params,
                               cv=folds,
                               n_jobs=-1,
                               scoring=scoring,
                               refit=True,
                               return_train_score=True)
    grid_search.fit(x_train, y_train)
    scores = grid_search.cv_results_
    # The tuned hyperparameter is the grid's first (only) key; clearer than
    # the original loop-with-break idiom.
    hyperparameters = next(iter(params))
    # Plot mean train vs. cross-validation score for each candidate value
    plt.figure(figsize=(16, 5))
    plt.plot(scores["param_" + hyperparameters], scores["mean_train_score"], label="training accuracy")
    plt.plot(scores["param_" + hyperparameters], scores["mean_test_score"], label="test accuracy")
    plt.xlabel(hyperparameters)
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()
# Tune max_depth over 2..37 in steps of 5
params = {'max_depth': range(2, 40, 5)}
hpt(params, X_train_pca, y_train, "accuracy")
We see that as max_depth increases, the training and test accuracy increase.
We will use 12 and 17 for the grid search as they have peaks.
# Tune min_samples_leaf over 1..91 in steps of 10
params = {'min_samples_leaf': range(1, 100, 10)}
hpt(params, X_train_pca, y_train, "accuracy")
We will use values between 10 and 20 to train our model as there is a steep dip after 20.
# Tune min_samples_split over 10..90 in steps of 10
params = {'min_samples_split': range(10, 100, 10)}
hpt(params, X_train_pca, y_train, "accuracy")
We will use values between 10 and 20 as the first dip is at 20.
# Tune n_estimators over 50..950 in steps of 100
params = {'n_estimators': range(50, 1000, 100)}
hpt(params, X_train_pca, y_train, "accuracy")
We will use 150 since the test accuracy is slightly dipping post that.
# Tune max_features over 10..40 in steps of 10, with max_depth fixed at 5
params = {'max_features': range(10, 50, 10)}
hpt(params, X_train_pca, y_train, "accuracy", 5)
We can use 15 or log2 for max features
# Stratified 3-fold CV for the combined Random Forest grid search
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
# Candidate values chosen from the individual tuning plots above
params = {
    'max_depth': [12, 17],
    'min_samples_leaf': [10, 20],
    'min_samples_split': [10, 20],
    'n_estimators': [150],
    'max_features': [15]
}
# Base model with a fixed seed for reproducible folds/forests
rfc1 = RandomForestClassifier(random_state=42, n_jobs=-1)
# Grid search scored on accuracy, refitting the best combination
grid_search = GridSearchCV(estimator=rfc1,
                           param_grid=params,
                           cv=folds,
                           n_jobs=-1,
                           verbose=1,
                           scoring="accuracy",
                           refit=True,
                           return_train_score=True)
grid_search.fit(X_train_pca, y_train)
Fitting 3 folds for each of 8 candidates, totalling 24 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
n_jobs=-1,
param_grid={'max_depth': [12, 17], 'max_features': [15],
'min_samples_leaf': [10, 20],
'min_samples_split': [10, 20], 'n_estimators': [150]},
return_train_score=True, scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
n_jobs=-1,
param_grid={'max_depth': [12, 17], 'max_features': [15],
'min_samples_leaf': [10, 20],
'min_samples_split': [10, 20], 'n_estimators': [150]},
return_train_score=True, scoring='accuracy', verbose=1)RandomForestClassifier(n_jobs=-1, random_state=42)
RandomForestClassifier(n_jobs=-1, random_state=42)
# print best hyperparameters
# Scoring was "accuracy", so label the best score accordingly
# (the original "Best AUC" label was misleading).
print("Best accuracy: ", grid_search.best_score_)
print("Best hyperparameters: ", grid_search.best_params_)
Best AUC: 0.9119480580874569
Best hyperparameters: {'max_depth': 17, 'max_features': 15, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 150}
# Building the model with the optimal hyperparameters found by the grid
# search. random_state=42 is added so the refit model is reproducible and
# consistent with the grid search's base estimator (which used seed 42);
# the stray "check these values" review note is removed.
rf1_pca_model = RandomForestClassifier(bootstrap=True,
                                       max_depth=17,
                                       min_samples_leaf=10,
                                       min_samples_split=10,
                                       max_features=15,
                                       n_estimators=150,
                                       random_state=42)
rf1_pca_model.fit(X_train_pca, y_train)
# Predictions on the train set
y_train_pred = rf1_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = rf1_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[14625 775] | | | [ 348 15052]] | +--------------------+-----------------+ | Accuracy | 0.964 | +--------------------+-----------------+ | Sensitivity/Recall | 0.977 | +--------------------+-----------------+ | Specificity | 0.95 | +--------------------+-----------------+ | Precision | 0.951 | +--------------------+-----------------+ | F1-score | 0.964 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3556 307] | | | [ 111 226]] | +--------------------+---------------+ | Accuracy | 0.9 | +--------------------+---------------+ | Sensitivity/Recall | 0.671 | +--------------------+---------------+ | Specificity | 0.921 | +--------------------+---------------+ | Precision | 0.424 | +--------------------+---------------+ | F1-score | 0.52 | +--------------------+---------------+
# Alternative Random Forest: balanced class weights and log2 feature
# sampling. The commented-out criterion line and the stray "check these
# values" note are removed; random_state=42 is added so results are
# reproducible across runs.
rf3_pca_model = RandomForestClassifier(bootstrap=True,
                                       min_samples_split=10,
                                       max_features='log2',
                                       class_weight='balanced',
                                       n_estimators=200,
                                       random_state=42)
rf3_pca_model.fit(X_train_pca, y_train)
# Predictions on the train set
y_train_pred = rf3_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = rf3_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15269 131] | | | [ 28 15372]] | +--------------------+-----------------+ | Accuracy | 0.995 | +--------------------+-----------------+ | Sensitivity/Recall | 0.998 | +--------------------+-----------------+ | Specificity | 0.991 | +--------------------+-----------------+ | Precision | 0.992 | +--------------------+-----------------+ | F1-score | 0.995 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3639 224] | | | [ 145 192]] | +--------------------+---------------+ | Accuracy | 0.912 | +--------------------+---------------+ | Sensitivity/Recall | 0.57 | +--------------------+---------------+ | Specificity | 0.942 | +--------------------+---------------+ | Precision | 0.462 | +--------------------+---------------+ | F1-score | 0.51 | +--------------------+---------------+
# Basic XGBoost to check accuracy
# XGBClassifier is not imported in the notebook header, so import it here
# to avoid a NameError when this cell runs.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train_pca, y_train)
y_test_pred = xgb.predict(X_test_pca)
accuracy = round(metrics.accuracy_score(y_test, y_test_pred), 3)
print("Accuracy of basic XGB model is:", accuracy)
Accuracy of basic XGB model is: 0.905
Basic accuracy of XGB is very good. Therefore, let's tune the parameters for it.
# Stratified 3-fold CV for the XGBoost hyperparameter search
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
# Candidate learning rates, tree depths and row-subsampling ratios
params = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [2, 5],
    'n_estimators': [200],
    'subsample': [0.3, 0.4, 0.5, 1.0]
}
# Base XGBoost model with a fixed seed for reproducibility
xgb1 = XGBClassifier(random_state=42, n_jobs=-1)
# Grid search scored on accuracy, refitting the best combination
grid_search = GridSearchCV(estimator=xgb1,
                           param_grid=params,
                           cv=folds,
                           n_jobs=-1,
                           verbose=1,
                           scoring="accuracy",
                           refit=True,
                           return_train_score=True)
grid_search.fit(X_train_pca, y_train)
Fitting 3 folds for each of 24 candidates, totalling 72 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_ty...
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=-1, num_parallel_tree=None,
random_state=42, ...),
n_jobs=-1,
param_grid={'learning_rate': [0.1, 0.2, 0.3], 'max_depth': [2, 5],
'n_estimators': [200],
'subsample': [0.3, 0.4, 0.5, 1.0]},
return_train_score=True, scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_ty...
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=-1, num_parallel_tree=None,
random_state=42, ...),
n_jobs=-1,
param_grid={'learning_rate': [0.1, 0.2, 0.3], 'max_depth': [2, 5],
'n_estimators': [200],
'subsample': [0.3, 0.4, 0.5, 1.0]},
return_train_score=True, scoring='accuracy', verbose=1)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=42, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=42, ...)
# print best hyperparameters
# Scoring was "accuracy", so label the best score accordingly
# (the original "Best AUC" label was misleading).
print("Best accuracy: ", grid_search.best_score_)
print("Best hyperparameters: ", grid_search.best_params_)
Best AUC: 0.9449349868100266
Best hyperparameters: {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
# Building the model with the optimal hyperparameters from the grid search.
# random_state=42 is added to match the grid's base estimator and make the
# refit reproducible; the stray "check these values" review note is removed.
xgb1_pca_model = XGBClassifier(learning_rate=0.3,
                               max_depth=5,
                               subsample=1.0,
                               n_estimators=200,
                               random_state=42)
xgb1_pca_model.fit(X_train_pca, y_train)
# Predictions on the train set
y_train_pred = xgb1_pca_model.predict(X_train_pca)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = xgb1_pca_model.predict(X_test_pca)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15255 145] | | | [ 8 15392]] | +--------------------+-----------------+ | Accuracy | 0.995 | +--------------------+-----------------+ | Sensitivity/Recall | 0.999 | +--------------------+-----------------+ | Specificity | 0.991 | +--------------------+-----------------+ | Precision | 0.991 | +--------------------+-----------------+ | F1-score | 0.995 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3587 276] | | | [ 132 205]] | +--------------------+---------------+ | Accuracy | 0.903 | +--------------------+---------------+ | Sensitivity/Recall | 0.608 | +--------------------+---------------+ | Specificity | 0.929 | +--------------------+---------------+ | Precision | 0.426 | +--------------------+---------------+ | F1-score | 0.501 | +--------------------+---------------+
While using PCA, we saw above how well Random Forest has performed. Therefore, we will use Random Forest to identify important features that cause a customer to churn.
# Small Random Forest on the original (non-PCA) features, with out-of-bag
# scoring as a quick generalization estimate
rf = RandomForestClassifier(n_estimators=10, max_depth=4, max_features=5, random_state=100, oob_score=True)
rf.fit(X_train, y_train)
# Out-of-bag accuracy
rf.oob_score_
0.8417532467532467
# Churn probabilities for the positive class on the test set
y_probs = rf.predict_proba(X_test)[:, 1]
# ROC curve points and area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the random-classifier diagonal
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()
# Tune max_depth on the original features, scored by ROC-AUC
params = {'max_depth': range(2, 40, 5)}
hpt(params, X_train, y_train, 'roc_auc')
We will use 12 and 17 for max_depth.
# Tune min_samples_leaf on the original features, scored by ROC-AUC
params = {'min_samples_leaf': range(1, 100, 10)}
hpt(params, X_train, y_train, 'roc_auc')
We will use values 10 and 20 for min_samples_leaf.
# Tune min_samples_split on the original features, scored by ROC-AUC
params = {'min_samples_split': range(10, 100, 10)}
hpt(params, X_train, y_train, 'roc_auc')
We will use values between 20 and 30 for min_samples_split.
# Tune n_estimators on the original features, scored by ROC-AUC
params = {'n_estimators': range(50, 200, 50)}
hpt(params, X_train, y_train, 'roc_auc')
We will use 100 estimators.
# Tune max_features on the original features (max_depth fixed at 5), ROC-AUC
params = {'max_features': range(10, 50, 10)}
hpt(params, X_train, y_train, 'roc_auc', 5)
We will use 15 max_features
# Stratified 3-fold CV for the final Random Forest search on the original
# features, scored by ROC-AUC
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
# Candidate values chosen from the individual tuning plots above
params = {
    'max_depth': [12, 17],
    'min_samples_leaf': [10, 20],
    'min_samples_split': [20, 30],
    'n_estimators': [100],
    'max_features': [15]
}
# Base model with a fixed seed for reproducibility
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
# Grid search, refitting the best combination on the full training set
grid_search = GridSearchCV(estimator=rfc,
                           param_grid=params,
                           cv=folds,
                           n_jobs=-1,
                           verbose=1,
                           scoring="roc_auc",
                           refit=True,
                           return_train_score=True)
grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 8 candidates, totalling 24 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
n_jobs=-1,
param_grid={'max_depth': [12, 17], 'max_features': [15],
'min_samples_leaf': [10, 20],
'min_samples_split': [20, 30], 'n_estimators': [100]},
return_train_score=True, scoring='roc_auc', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=True),
estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
n_jobs=-1,
param_grid={'max_depth': [12, 17], 'max_features': [15],
'min_samples_leaf': [10, 20],
'min_samples_split': [20, 30], 'n_estimators': [100]},
return_train_score=True, scoring='roc_auc', verbose=1)RandomForestClassifier(n_jobs=-1, random_state=42)
RandomForestClassifier(n_jobs=-1, random_state=42)
# Best ROC-AUC and the hyperparameters that achieved it
print("Best AUC: ", grid_search.best_score_)
print("Best hyperparameters: ", grid_search.best_params_)
Best AUC: 0.9804101070954854
Best hyperparameters: {'max_depth': 17, 'max_features': 15, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 100}
# Final Random Forest on the original features with the tuned
# hyperparameters. random_state=42 is added so the refit model is
# reproducible and consistent with the grid search's base estimator.
rfc1_model = RandomForestClassifier(bootstrap=True,
                                    max_depth=17,
                                    min_samples_leaf=10,
                                    min_samples_split=20,
                                    max_features=15,
                                    n_estimators=100,
                                    random_state=42)
rfc1_model.fit(X_train, y_train)
# Predictions on the train set
y_train_pred = rfc1_model.predict(X_train)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = rfc1_model.predict(X_test)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[14743 657] | | | [ 332 15068]] | +--------------------+-----------------+ | Accuracy | 0.968 | +--------------------+-----------------+ | Sensitivity/Recall | 0.978 | +--------------------+-----------------+ | Specificity | 0.957 | +--------------------+-----------------+ | Precision | 0.958 | +--------------------+-----------------+ | F1-score | 0.968 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3603 260] | | | [ 91 246]] | +--------------------+---------------+ | Accuracy | 0.916 | +--------------------+---------------+ | Sensitivity/Recall | 0.73 | +--------------------+---------------+ | Specificity | 0.933 | +--------------------+---------------+ | Precision | 0.486 | +--------------------+---------------+ | F1-score | 0.584 | +--------------------+---------------+
# Churn probabilities for the positive class on the test set
y_probs = rfc1_model.predict_proba(X_test)[:, 1]
# ROC curve points and area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the random-classifier diagonal
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()
# Best estimator from the grid search and its impurity-based importances
rf_best = grid_search.best_estimator_
rf_best.feature_importances_
array([0.07429819, 0.00499139, 0.01862225, 0.06838385, 0.11619769,
0.02126155, 0.03342574, 0.00193895, 0.0019986 , 0.00463145,
0.00432121, 0.00036991, 0.00229602, 0.00417765, 0. ,
0.05996093, 0.11860825, 0.00411264, 0.00250917, 0.00581508,
0.00146499, 0.00417055, 0.00222408, 0.00200574, 0.01133603,
0.07389465, 0.04585511, 0.03200324, 0.03949407, 0.00857509,
0.0088965 , 0.00314294, 0.00545635, 0.0018638 , 0.00045199,
0.00569899, 0.00548926, 0.0283542 , 0.00220699, 0.00358296,
0.00802657, 0.0042657 , 0.00466924, 0.00443595, 0.00562377,
0.00389032, 0.00392108, 0.00586191, 0.00230565, 0.00496061,
0.00531422, 0.00398655, 0.01061229, 0.00376367, 0.00729996,
0.00471356, 0.00322928, 0.00217339, 0.00428024, 0.00559263,
0.00485166, 0.00735617, 0.0043419 , 0.00416439, 0.0020631 ,
0.00118487, 0.00722192, 0.0060248 , 0.00566229, 0.0036704 ,
0.00516108, 0.00407848, 0.00407063, 0.00257502, 0.0045587 ])
# Pair each feature name with its importance, then rank descending so the
# strongest churn predictors appear first
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_
})
imp_df.sort_values(by="Imp", ascending=False)
| Varname | Imp | |
|---|---|---|
| 16 | loc_ic_t2m_mou_8 | 0.118608 |
| 4 | roam_og_mou_8 | 0.116198 |
| 0 | arpu_8 | 0.074298 |
| 25 | total_rech_amt_8 | 0.073895 |
| 3 | roam_ic_mou_8 | 0.068384 |
| 15 | loc_ic_t2t_mou_8 | 0.059961 |
| 26 | max_rech_amt_8 | 0.045855 |
| 28 | max_rech_data_8 | 0.039494 |
| 6 | loc_og_t2m_mou_8 | 0.033426 |
| 27 | last_day_rch_amt_8 | 0.032003 |
| 37 | total_rech_amt_data_8 | 0.028354 |
| 5 | loc_og_t2t_mou_8 | 0.021262 |
| 2 | offnet_mou_8 | 0.018622 |
| 24 | total_rech_num_8 | 0.011336 |
| 52 | avg_roam_og_mou_6_7 | 0.010612 |
| 30 | vol_3g_mb_8 | 0.008896 |
| 29 | vol_2g_mb_8 | 0.008575 |
| 40 | avg_std_og_t2t_mou_6_7 | 0.008027 |
| 61 | avg_total_rech_num_6_7 | 0.007356 |
| 54 | avg_loc_og_t2c_mou_6_7 | 0.007300 |
| 66 | avg_loc_ic_t2m_mou_6_7 | 0.007222 |
| 67 | avg_roam_ic_mou_6_7 | 0.006025 |
| 47 | avg_total_rech_amt_6_7 | 0.005862 |
| 19 | std_ic_t2m_mou_8 | 0.005815 |
| 35 | aon | 0.005699 |
| 68 | avg_max_rech_amt_6_7 | 0.005662 |
| 44 | avg_spl_og_mou_6_7 | 0.005624 |
| 59 | avg_arpu_6_7 | 0.005593 |
| 36 | 3g_vbc_8 | 0.005489 |
| 32 | sachet_2g_8 | 0.005456 |
| 50 | avg_isd_ic_mou_6_7 | 0.005314 |
| 70 | avg_max_rech_data_6_7 | 0.005161 |
| 1 | onnet_mou_8 | 0.004991 |
| 49 | avg_monthly_2g_6_7 | 0.004961 |
| 60 | avg_last_day_rch_amt_6_7 | 0.004852 |
| 55 | avg_std_og_t2m_mou_6_7 | 0.004714 |
| 42 | avg_std_ic_t2t_mou_6_7 | 0.004669 |
| 9 | std_og_t2t_mou_8 | 0.004631 |
| 74 | avg_loc_ic_t2t_mou_6_7 | 0.004559 |
| 43 | avg_3g_vbc_6_7 | 0.004436 |
| 62 | avg_onnet_mou_6_7 | 0.004342 |
| 10 | std_og_t2m_mou_8 | 0.004321 |
| 58 | avg_loc_og_t2m_mou_6_7 | 0.004280 |
| 41 | avg_total_rech_amt_data_6_7 | 0.004266 |
| 13 | spl_og_mou_8 | 0.004178 |
| 21 | spl_ic_mou_8 | 0.004171 |
| 63 | avg_vol_3g_mb_6_7 | 0.004164 |
| 17 | loc_ic_t2f_mou_8 | 0.004113 |
| 71 | avg_offnet_mou_6_7 | 0.004078 |
| 72 | avg_loc_og_t2t_mou_6_7 | 0.004071 |
| 51 | avg_loc_og_t2f_mou_6_7 | 0.003987 |
| 46 | avg_loc_ic_t2f_mou_6_7 | 0.003921 |
| 45 | avg_ic_others_6_7 | 0.003890 |
| 53 | avg_spl_ic_mou_6_7 | 0.003764 |
| 69 | avg_vol_2g_mb_6_7 | 0.003670 |
| 39 | avg_std_ic_t2m_mou_6_7 | 0.003583 |
| 56 | avg_isd_og_mou_6_7 | 0.003229 |
| 31 | monthly_2g_8 | 0.003143 |
| 73 | avg_monthly_3g_6_7 | 0.002575 |
| 18 | std_ic_t2t_mou_8 | 0.002509 |
| 48 | avg_std_ic_t2f_mou_6_7 | 0.002306 |
| 12 | isd_og_mou_8 | 0.002296 |
| 22 | isd_ic_mou_8 | 0.002224 |
| 38 | avg_og_others_6_7 | 0.002207 |
| 57 | avg_sachet_2g_6_7 | 0.002173 |
| 64 | avg_sachet_3g_6_7 | 0.002063 |
| 23 | ic_others_8 | 0.002006 |
| 8 | loc_og_t2c_mou_8 | 0.001999 |
| 7 | loc_og_t2f_mou_8 | 0.001939 |
| 33 | monthly_3g_8 | 0.001864 |
| 20 | std_ic_t2f_mou_8 | 0.001465 |
| 65 | avg_std_og_t2f_mou_6_7 | 0.001185 |
| 34 | sachet_3g_8 | 0.000452 |
| 11 | std_og_t2f_mou_8 | 0.000370 |
| 14 | og_others_8 | 0.000000 |
# Fit a random forest with the tuned hyper-parameters and report
# metrics on both the training and the validation split.
rfc2_model = RandomForestClassifier(
    bootstrap=True,
    min_samples_split=15,
    max_features='log2',
    criterion='entropy',
    class_weight='balanced',
    n_estimators=150,
)
rfc2_model.fit(X_train, y_train)

# How well the model reproduces the data it was fitted on
y_train_pred = rfc2_model.predict(X_train)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)

# Generalisation check on the held-out split
y_test_pred = rfc2_model.predict(X_test)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15158 242] | | | [ 87 15313]] | +--------------------+-----------------+ | Accuracy | 0.989 | +--------------------+-----------------+ | Sensitivity/Recall | 0.994 | +--------------------+-----------------+ | Specificity | 0.984 | +--------------------+-----------------+ | Precision | 0.984 | +--------------------+-----------------+ | F1-score | 0.989 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3665 198] | | | [ 102 235]] | +--------------------+---------------+ | Accuracy | 0.929 | +--------------------+---------------+ | Sensitivity/Recall | 0.697 | +--------------------+---------------+ | Specificity | 0.949 | +--------------------+---------------+ | Precision | 0.543 | +--------------------+---------------+ | F1-score | 0.61 | +--------------------+---------------+
# Positive-class (churn) probabilities on the validation split
y_probs = rfc2_model.predict_proba(X_test)[:, 1]

# ROC curve points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Visualise the TPR/FPR trade-off against the random-guess diagonal
plt.figure(figsize=(6, 4))
plt.plot(
    fpr, tpr,
    color='darkorange', lw=2,
    label=f'ROC curve (AUC = {roc_auc:.2f})',
)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
The above ROC curve hugs the left-hand border and then the top border of the ROC space; the closer the curve follows these borders, the more accurate the classifier — so this model discriminates well between churners and non-churners.
# More finetuning
# NOTE(review): every hyper-parameter below is identical to rfc2_model's
# (bootstrap=True, min_samples_split=15, max_features='log2',
# criterion='entropy', class_weight='balanced', n_estimators=150), so any
# metric difference vs. rfc2 comes only from the forest's internal
# randomness (no random_state is set) — confirm whether a different
# configuration was intended for this "finetuning" step.
rfc3_model = RandomForestClassifier(bootstrap=True,
min_samples_split=15,
max_features='log2',
criterion='entropy',
class_weight='balanced',
n_estimators=150)
rfc3_model.fit(X_train, y_train)
# Predictions on the train set
y_train_pred = rfc3_model.predict(X_train)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)
# Prediction on the test set
y_test_pred = rfc3_model.predict(X_test)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15160 240] | | | [ 88 15312]] | +--------------------+-----------------+ | Accuracy | 0.989 | +--------------------+-----------------+ | Sensitivity/Recall | 0.994 | +--------------------+-----------------+ | Specificity | 0.984 | +--------------------+-----------------+ | Precision | 0.985 | +--------------------+-----------------+ | F1-score | 0.989 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3654 209] | | | [ 104 233]] | +--------------------+---------------+ | Accuracy | 0.925 | +--------------------+---------------+ | Sensitivity/Recall | 0.691 | +--------------------+---------------+ | Specificity | 0.946 | +--------------------+---------------+ | Precision | 0.527 | +--------------------+---------------+ | F1-score | 0.598 | +--------------------+---------------+
# More finetuning: a larger forest (500 trees) using sklearn's default
# depth/split settings (the previously tried min_samples_split and
# criterion overrides are dropped — dead commented-out code removed).
# NOTE(review): with no split/depth constraint the forest memorises the
# training data (100% train accuracy in the output below); the
# validation metrics are the ones that matter for model selection.
rfc4_model = RandomForestClassifier(
    max_features='log2',
    class_weight='balanced',
    n_estimators=500)
rfc4_model.fit(X_train, y_train)

# Predictions on the train set
y_train_pred = rfc4_model.predict(X_train)
print("\nTraining set:")
model_metrics(y_train, y_train_pred)

# Prediction on the test set
y_test_pred = rfc4_model.predict(X_test)
print("\nValidation set:")
model_metrics(y_test, y_test_pred)
Training set: +--------------------+-----------------+ | Metric | Value | +====================+=================+ | Confusion matrix | [[15400 0] | | | [ 0 15400]] | +--------------------+-----------------+ | Accuracy | 1.0 | +--------------------+-----------------+ | Sensitivity/Recall | 1.0 | +--------------------+-----------------+ | Specificity | 1.0 | +--------------------+-----------------+ | Precision | 1.0 | +--------------------+-----------------+ | F1-score | 1.0 | +--------------------+-----------------+ Validation set: +--------------------+---------------+ | Metric | Value | +====================+===============+ | Confusion matrix | [[3698 165] | | | [ 109 228]] | +--------------------+---------------+ | Accuracy | 0.935 | +--------------------+---------------+ | Sensitivity/Recall | 0.677 | +--------------------+---------------+ | Specificity | 0.957 | +--------------------+---------------+ | Precision | 0.58 | +--------------------+---------------+ | F1-score | 0.625 | +--------------------+---------------+
rfc4_model — This model is used for predictions on the test data. It will be used to predict whether a high-value customer will churn in the near future (i.e. the churn phase). By knowing this, the company can take action steps such as providing special plans, discounts on recharge, etc.

rfc1_model — This model is being used to identify the top predictors of customer churn. It will be used to identify important variables that are strong predictors of churn. These variables may also indicate why customers choose to switch to other networks. The top predictors are displayed below in the next section.

log1_pca_model — This is a logistic regression model. Although this model has lower accuracy compared to other models, its sensitivity is more than 80%, which is the highest among all models. We would recommend this model as it identifies customers who will definitely churn more accurately than those who will not churn; this is given by the sensitivity metric. Logistic regression models are also simple and consume fewer computational resources.

imp_df.sort_values(by="Imp", ascending=False)
| Varname | Imp | |
|---|---|---|
| 16 | loc_ic_t2m_mou_8 | 0.118608 |
| 4 | roam_og_mou_8 | 0.116198 |
| 0 | arpu_8 | 0.074298 |
| 25 | total_rech_amt_8 | 0.073895 |
| 3 | roam_ic_mou_8 | 0.068384 |
| 15 | loc_ic_t2t_mou_8 | 0.059961 |
| 26 | max_rech_amt_8 | 0.045855 |
| 28 | max_rech_data_8 | 0.039494 |
| 6 | loc_og_t2m_mou_8 | 0.033426 |
| 27 | last_day_rch_amt_8 | 0.032003 |
| 37 | total_rech_amt_data_8 | 0.028354 |
| 5 | loc_og_t2t_mou_8 | 0.021262 |
| 2 | offnet_mou_8 | 0.018622 |
| 24 | total_rech_num_8 | 0.011336 |
| 52 | avg_roam_og_mou_6_7 | 0.010612 |
| 30 | vol_3g_mb_8 | 0.008896 |
| 29 | vol_2g_mb_8 | 0.008575 |
| 40 | avg_std_og_t2t_mou_6_7 | 0.008027 |
| 61 | avg_total_rech_num_6_7 | 0.007356 |
| 54 | avg_loc_og_t2c_mou_6_7 | 0.007300 |
| 66 | avg_loc_ic_t2m_mou_6_7 | 0.007222 |
| 67 | avg_roam_ic_mou_6_7 | 0.006025 |
| 47 | avg_total_rech_amt_6_7 | 0.005862 |
| 19 | std_ic_t2m_mou_8 | 0.005815 |
| 35 | aon | 0.005699 |
| 68 | avg_max_rech_amt_6_7 | 0.005662 |
| 44 | avg_spl_og_mou_6_7 | 0.005624 |
| 59 | avg_arpu_6_7 | 0.005593 |
| 36 | 3g_vbc_8 | 0.005489 |
| 32 | sachet_2g_8 | 0.005456 |
| 50 | avg_isd_ic_mou_6_7 | 0.005314 |
| 70 | avg_max_rech_data_6_7 | 0.005161 |
| 1 | onnet_mou_8 | 0.004991 |
| 49 | avg_monthly_2g_6_7 | 0.004961 |
| 60 | avg_last_day_rch_amt_6_7 | 0.004852 |
| 55 | avg_std_og_t2m_mou_6_7 | 0.004714 |
| 42 | avg_std_ic_t2t_mou_6_7 | 0.004669 |
| 9 | std_og_t2t_mou_8 | 0.004631 |
| 74 | avg_loc_ic_t2t_mou_6_7 | 0.004559 |
| 43 | avg_3g_vbc_6_7 | 0.004436 |
| 62 | avg_onnet_mou_6_7 | 0.004342 |
| 10 | std_og_t2m_mou_8 | 0.004321 |
| 58 | avg_loc_og_t2m_mou_6_7 | 0.004280 |
| 41 | avg_total_rech_amt_data_6_7 | 0.004266 |
| 13 | spl_og_mou_8 | 0.004178 |
| 21 | spl_ic_mou_8 | 0.004171 |
| 63 | avg_vol_3g_mb_6_7 | 0.004164 |
| 17 | loc_ic_t2f_mou_8 | 0.004113 |
| 71 | avg_offnet_mou_6_7 | 0.004078 |
| 72 | avg_loc_og_t2t_mou_6_7 | 0.004071 |
| 51 | avg_loc_og_t2f_mou_6_7 | 0.003987 |
| 46 | avg_loc_ic_t2f_mou_6_7 | 0.003921 |
| 45 | avg_ic_others_6_7 | 0.003890 |
| 53 | avg_spl_ic_mou_6_7 | 0.003764 |
| 69 | avg_vol_2g_mb_6_7 | 0.003670 |
| 39 | avg_std_ic_t2m_mou_6_7 | 0.003583 |
| 56 | avg_isd_og_mou_6_7 | 0.003229 |
| 31 | monthly_2g_8 | 0.003143 |
| 73 | avg_monthly_3g_6_7 | 0.002575 |
| 18 | std_ic_t2t_mou_8 | 0.002509 |
| 48 | avg_std_ic_t2f_mou_6_7 | 0.002306 |
| 12 | isd_og_mou_8 | 0.002296 |
| 22 | isd_ic_mou_8 | 0.002224 |
| 38 | avg_og_others_6_7 | 0.002207 |
| 57 | avg_sachet_2g_6_7 | 0.002173 |
| 64 | avg_sachet_3g_6_7 | 0.002063 |
| 23 | ic_others_8 | 0.002006 |
| 8 | loc_og_t2c_mou_8 | 0.001999 |
| 7 | loc_og_t2f_mou_8 | 0.001939 |
| 33 | monthly_3g_8 | 0.001864 |
| 20 | std_ic_t2f_mou_8 | 0.001465 |
| 65 | avg_std_og_t2f_mou_6_7 | 0.001185 |
| 34 | sachet_3g_8 | 0.000452 |
| 11 | std_og_t2f_mou_8 | 0.000370 |
| 14 | og_others_8 | 0.000000 |
From EDA, we observe the following:
Recommendation: Bring in long-term packs like half-yearly or yearly ones at attractive prices — customers purchasing these packs will tend to churn less.

Recommendation: For new customers, provide attractive offers and schemes that will make them stay longer with the operator.

Recommendation: Invest in attractive call-related packs. Whenever customers buy smaller call-related packs, the operator must be alerted to a potential churn.

Recommendation: Whenever a sudden dip in usage is noticed, the operator must be alerted to a potential churn.

kaggle = pd.read_csv("test.csv")
kaggle.head()
| id | circle_id | loc_og_t2o_mou | std_og_t2o_mou | loc_ic_t2o_mou | last_date_of_month_6 | last_date_of_month_7 | last_date_of_month_8 | arpu_6 | arpu_7 | arpu_8 | onnet_mou_6 | onnet_mou_7 | onnet_mou_8 | offnet_mou_6 | offnet_mou_7 | offnet_mou_8 | roam_ic_mou_6 | roam_ic_mou_7 | roam_ic_mou_8 | roam_og_mou_6 | roam_og_mou_7 | roam_og_mou_8 | loc_og_t2t_mou_6 | loc_og_t2t_mou_7 | loc_og_t2t_mou_8 | loc_og_t2m_mou_6 | loc_og_t2m_mou_7 | loc_og_t2m_mou_8 | loc_og_t2f_mou_6 | loc_og_t2f_mou_7 | loc_og_t2f_mou_8 | loc_og_t2c_mou_6 | loc_og_t2c_mou_7 | loc_og_t2c_mou_8 | loc_og_mou_6 | loc_og_mou_7 | loc_og_mou_8 | std_og_t2t_mou_6 | std_og_t2t_mou_7 | std_og_t2t_mou_8 | std_og_t2m_mou_6 | std_og_t2m_mou_7 | std_og_t2m_mou_8 | std_og_t2f_mou_6 | std_og_t2f_mou_7 | std_og_t2f_mou_8 | std_og_t2c_mou_6 | std_og_t2c_mou_7 | std_og_t2c_mou_8 | std_og_mou_6 | std_og_mou_7 | std_og_mou_8 | isd_og_mou_6 | isd_og_mou_7 | isd_og_mou_8 | spl_og_mou_6 | spl_og_mou_7 | spl_og_mou_8 | og_others_6 | og_others_7 | og_others_8 | total_og_mou_6 | total_og_mou_7 | total_og_mou_8 | loc_ic_t2t_mou_6 | loc_ic_t2t_mou_7 | loc_ic_t2t_mou_8 | loc_ic_t2m_mou_6 | loc_ic_t2m_mou_7 | loc_ic_t2m_mou_8 | loc_ic_t2f_mou_6 | loc_ic_t2f_mou_7 | loc_ic_t2f_mou_8 | loc_ic_mou_6 | loc_ic_mou_7 | loc_ic_mou_8 | std_ic_t2t_mou_6 | std_ic_t2t_mou_7 | std_ic_t2t_mou_8 | std_ic_t2m_mou_6 | std_ic_t2m_mou_7 | std_ic_t2m_mou_8 | std_ic_t2f_mou_6 | std_ic_t2f_mou_7 | std_ic_t2f_mou_8 | std_ic_t2o_mou_6 | std_ic_t2o_mou_7 | std_ic_t2o_mou_8 | std_ic_mou_6 | std_ic_mou_7 | std_ic_mou_8 | total_ic_mou_6 | total_ic_mou_7 | total_ic_mou_8 | spl_ic_mou_6 | spl_ic_mou_7 | spl_ic_mou_8 | isd_ic_mou_6 | isd_ic_mou_7 | isd_ic_mou_8 | ic_others_6 | ic_others_7 | ic_others_8 | total_rech_num_6 | total_rech_num_7 | total_rech_num_8 | total_rech_amt_6 | total_rech_amt_7 | total_rech_amt_8 | max_rech_amt_6 | max_rech_amt_7 | max_rech_amt_8 | date_of_last_rech_6 | date_of_last_rech_7 | date_of_last_rech_8 | last_day_rch_amt_6 
| last_day_rch_amt_7 | last_day_rch_amt_8 | date_of_last_rech_data_6 | date_of_last_rech_data_7 | date_of_last_rech_data_8 | total_rech_data_6 | total_rech_data_7 | total_rech_data_8 | max_rech_data_6 | max_rech_data_7 | max_rech_data_8 | count_rech_2g_6 | count_rech_2g_7 | count_rech_2g_8 | count_rech_3g_6 | count_rech_3g_7 | count_rech_3g_8 | av_rech_amt_data_6 | av_rech_amt_data_7 | av_rech_amt_data_8 | vol_2g_mb_6 | vol_2g_mb_7 | vol_2g_mb_8 | vol_3g_mb_6 | vol_3g_mb_7 | vol_3g_mb_8 | arpu_3g_6 | arpu_3g_7 | arpu_3g_8 | arpu_2g_6 | arpu_2g_7 | arpu_2g_8 | night_pck_user_6 | night_pck_user_7 | night_pck_user_8 | monthly_2g_6 | monthly_2g_7 | monthly_2g_8 | sachet_2g_6 | sachet_2g_7 | sachet_2g_8 | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | fb_user_6 | fb_user_7 | fb_user_8 | aon | aug_vbc_3g | jul_vbc_3g | jun_vbc_3g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69999 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 91.882 | 65.330 | 64.445 | 31.78 | 20.23 | 23.11 | 60.16 | 32.16 | 34.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 24.88 | 20.23 | 21.06 | 18.13 | 10.89 | 8.36 | 0.00 | 13.58 | 0.00 | 0.0 | 0.00 | 0.03 | 43.01 | 44.71 | 29.43 | 6.90 | 0.00 | 2.05 | 42.03 | 7.68 | 26.43 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 48.93 | 7.68 | 28.48 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.03 | 0.0 | 0.0 | 0.0 | 91.94 | 52.39 | 57.94 | 30.33 | 37.56 | 21.98 | 10.21 | 4.59 | 9.53 | 0.26 | 0.00 | 0.00 | 40.81 | 42.16 | 31.51 | 0.00 | 0.00 | 0.00 | 0.36 | 1.04 | 4.34 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.36 | 1.04 | 4.34 | 41.73 | 43.56 | 36.26 | 0.54 | 0.34 | 0.39 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 5 | 5 | 4 | 103 | 90 | 60 | 50 | 30 | 30 | 6/21/2014 | 7/26/2014 | 8/24/2014 | 30 | 30 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 1692 | 0.00 | 0.00 | 0.00 |
| 1 | 70000 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 414.168 | 515.568 | 360.868 | 75.51 | 41.21 | 19.84 | 474.34 | 621.84 | 394.94 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 75.51 | 41.21 | 19.84 | 473.61 | 598.08 | 377.26 | 0.73 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 549.86 | 639.29 | 397.11 | 0.00 | 0.00 | 0.00 | 0.00 | 23.76 | 17.68 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 23.76 | 17.68 | 0.0 | 0.0 | 0.8 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 549.86 | 663.06 | 415.59 | 19.99 | 26.95 | 2.61 | 160.19 | 122.29 | 184.81 | 1.49 | 0.00 | 0.00 | 181.69 | 149.24 | 187.43 | 0.00 | 0.00 | 0.00 | 0.00 | 12.51 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 12.51 | 0.00 | 296.33 | 339.64 | 281.66 | 0.00 | 0.00 | 0.00 | 114.63 | 177.88 | 94.23 | 0.0 | 0.0 | 0.00 | 5 | 4 | 5 | 500 | 500 | 500 | 250 | 250 | 250 | 6/19/2014 | 7/16/2014 | 8/24/2014 | 250 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 2533 | 0.00 | 0.00 | 0.00 |
| 2 | 70001 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 329.844 | 434.884 | 746.239 | 7.54 | 7.86 | 8.40 | 16.98 | 45.81 | 45.04 | 22.81 | 103.38 | 26.08 | 24.53 | 53.68 | 54.44 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 6 | 9 | 5 | 500 | 1000 | 1000 | 300 | 500 | 500 | 6/29/2014 | 7/27/2014 | 8/28/2014 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 277 | 525.61 | 758.41 | 241.84 |
| 3 | 70002 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 43.550 | 171.390 | 24.400 | 5.31 | 2.16 | 0.00 | 40.04 | 205.01 | 24.01 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.31 | 0.00 | 0.00 | 2.94 | 98.61 | 20.51 | 0.00 | 0.00 | 2.35 | 0.0 | 6.18 | 0.00 | 8.26 | 98.61 | 22.86 | 0.00 | 2.16 | 0.00 | 37.09 | 94.36 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 37.09 | 96.53 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 12.03 | 1.15 | 0.0 | 0.0 | 0.0 | 45.36 | 207.18 | 24.01 | 58.11 | 54.64 | 23.04 | 487.94 | 449.83 | 506.94 | 0.00 | 0.38 | 1.64 | 546.06 | 504.86 | 531.64 | 0.00 | 4.26 | 0.00 | 9.63 | 11.88 | 8.83 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 9.63 | 16.14 | 8.83 | 555.69 | 522.44 | 549.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.43 | 8.65 | 0.0 | 0.0 | 0.00 | 3 | 5 | 2 | 110 | 260 | 0 | 110 | 150 | 0 | 6/25/2014 | 7/30/2014 | 8/24/2014 | 110 | 150 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 1244 | 0.00 | 0.00 | 0.00 |
| 4 | 70003 | 109 | 0.0 | 0.0 | 0.0 | 6/30/2014 | 7/31/2014 | 8/31/2014 | 306.854 | 406.289 | 413.329 | 450.93 | 609.03 | 700.68 | 60.94 | 23.84 | 74.16 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.45 | 0.78 | 14.56 | 2.39 | 2.66 | 10.94 | 0.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 2.84 | 3.44 | 25.51 | 450.48 | 608.24 | 686.11 | 58.54 | 21.18 | 63.18 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 509.03 | 629.43 | 749.29 | 0.0 | 0.0 | 0.0 | 0.71 | 5.39 | 4.96 | 2.2 | 0.0 | 0.0 | 514.79 | 638.28 | 779.78 | 0.00 | 0.36 | 9.91 | 10.13 | 9.23 | 7.69 | 0.00 | 0.00 | 0.00 | 10.13 | 9.59 | 17.61 | 29.71 | 92.36 | 107.39 | 13.88 | 13.96 | 32.46 | 0.0 | 0.0 | 1.61 | 0.0 | 0.0 | 0.0 | 43.59 | 106.33 | 141.48 | 53.73 | 115.93 | 159.26 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.16 | 11 | 7 | 8 | 356 | 490 | 546 | 90 | 130 | 130 | 6/29/2014 | 7/29/2014 | 8/30/2014 | 50 | 130 | 130 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 462 | 0.00 | 0.00 | 0.00 |
kaggle.shape
(30000, 171)
# Column cleanup
# Rename the VBC columns so they follow the same <metric>_<month-number>
# naming convention used by every other monthly column.
kaggle = kaggle.rename(columns={'aug_vbc_3g': '3g_vbc_8',
                                'jul_vbc_3g': '3g_vbc_7',
                                'jun_vbc_3g': '3g_vbc_6'})

# NaN in the data-recharge columns is treated as "no data recharge that
# month", so impute 0.0.  fillna replaces the nine copy-pasted
# replace(np.NaN, 0.0) calls; note np.NaN is an alias removed in
# NumPy 2.0 — np.nan/fillna is the portable spelling.
data_rech_cols = [
    'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
    'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8',
    'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8',
]
kaggle[data_rech_cols] = kaggle[data_rech_cols].fillna(0.0)

# Create new column: total recharge amount for data: total_rech_amt_data
# (average recharge amount x number of recharges, per month).
for month in ('6', '7', '8'):
    kaggle[f'total_rech_amt_data_{month}'] = (
        kaggle[f'av_rech_amt_data_{month}'] * kaggle[f'total_rech_data_{month}']
    )

# Impute Minutes-of-Usage and incoming/outgoing columns the same way.
kaggle[col_mou] = kaggle[col_mou].fillna(0.0)
kaggle[col_og_ic] = kaggle[col_og_ic].fillna(0.0)

# Average out months 6 and 7 ("good phase" features) for every column
# the two months share.
for col in col_6_7_common:
    kaggle["avg_" + col + "_6_7"] = (kaggle[col + "_6"] + kaggle[col + "_7"]) / 2
# Keep only the final model's feature columns; .copy() so the outlier
# treatment below cannot mutate (or warn about) the original frame.
kaggle_pred = kaggle[final_column_list].copy()
kaggle_pred.shape
(30000, 75)
# Outlier Treatment: flag columns whose jump from the 99th to the 100th
# percentile exceeds 100%, then cap those columns at their own 99th
# percentile (winsorise the extreme tail only).
quantile_grid = kaggle_pred.quantile(np.arange(0.9, 1.01, 0.01),
                                     axis=0, numeric_only=True)
tail_jump_pct = quantile_grid.pct_change().mul(100).iloc[-1]
columns_with_outliers = tail_jump_pct[tail_jump_pct > 100].index.values
for feature in columns_with_outliers:
    cap = kaggle_pred[feature].quantile(0.99)
    kaggle_pred[feature] = kaggle_pred[feature].clip(upper=cap)
# Scale
# Reuse the scaler fitted on the training data — transform only, never
# re-fit on test data, so test features land in the same scaled space
# the models were trained in.
kaggle_pred[final_column_list] = scaler.transform(kaggle_pred[final_column_list])
kaggle_pred.head()
| arpu_8 | onnet_mou_8 | offnet_mou_8 | roam_ic_mou_8 | roam_og_mou_8 | loc_og_t2t_mou_8 | loc_og_t2m_mou_8 | loc_og_t2f_mou_8 | loc_og_t2c_mou_8 | std_og_t2t_mou_8 | std_og_t2m_mou_8 | std_og_t2f_mou_8 | isd_og_mou_8 | spl_og_mou_8 | og_others_8 | loc_ic_t2t_mou_8 | loc_ic_t2m_mou_8 | loc_ic_t2f_mou_8 | std_ic_t2t_mou_8 | std_ic_t2m_mou_8 | std_ic_t2f_mou_8 | spl_ic_mou_8 | isd_ic_mou_8 | ic_others_8 | total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | max_rech_data_8 | vol_2g_mb_8 | vol_3g_mb_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon | 3g_vbc_8 | total_rech_amt_data_8 | avg_og_others_6_7 | avg_std_ic_t2m_mou_6_7 | avg_std_og_t2t_mou_6_7 | avg_total_rech_amt_data_6_7 | avg_std_ic_t2t_mou_6_7 | avg_3g_vbc_6_7 | avg_spl_og_mou_6_7 | avg_ic_others_6_7 | avg_loc_ic_t2f_mou_6_7 | avg_total_rech_amt_6_7 | avg_std_ic_t2f_mou_6_7 | avg_monthly_2g_6_7 | avg_isd_ic_mou_6_7 | avg_loc_og_t2f_mou_6_7 | avg_roam_og_mou_6_7 | avg_spl_ic_mou_6_7 | avg_loc_og_t2c_mou_6_7 | avg_std_og_t2m_mou_6_7 | avg_isd_og_mou_6_7 | avg_sachet_2g_6_7 | avg_loc_og_t2m_mou_6_7 | avg_arpu_6_7 | avg_last_day_rch_amt_6_7 | avg_total_rech_num_6_7 | avg_onnet_mou_6_7 | avg_vol_3g_mb_6_7 | avg_sachet_3g_6_7 | avg_std_og_t2f_mou_6_7 | avg_loc_ic_t2m_mou_6_7 | avg_roam_ic_mou_6_7 | avg_max_rech_amt_6_7 | avg_vol_2g_mb_6_7 | avg_max_rech_data_6_7 | avg_offnet_mou_6_7 | avg_loc_og_t2t_mou_6_7 | avg_monthly_3g_6_7 | avg_loc_ic_t2t_mou_6_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.789460 | -0.418184 | -0.541440 | -0.309217 | -0.345146 | -0.209192 | -0.502761 | -0.327195 | -0.297952 | -0.360552 | -0.307597 | -0.187171 | -0.152370 | -0.418148 | 0.0 | -0.179451 | -0.530555 | -0.357595 | -0.315194 | -0.268584 | -0.224676 | 5.169981 | -0.229021 | -0.216127 | -0.443756 | -0.772632 | -0.629697 | -0.636077 | -0.530001 | -0.307769 | -0.356159 | -0.277974 | -0.337643 | -0.281795 | -0.190088 | 0.720061 | -0.351276 | -0.344829 | -0.430833 | -0.498593 | -0.517514 | -0.515336 | -0.450548 | -0.430986 | -0.609061 | -0.312114 | -0.422147 | -1.471579 | -0.293179 | -0.41431 | -0.306304 | 0.204236 | -0.366103 | 5.128793 | -0.450335 | -0.496410 | -0.212023 | -0.492124 | -0.647763 | -1.479153 | -0.688108 | -0.902716 | -0.604792 | -0.470776 | -0.326582 | -0.245369 | -0.742431 | -0.344064 | -0.941955 | -0.455492 | -0.743385 | -0.766474 | -0.305738 | -0.424651 | -0.183716 |
| 1 | -0.013426 | -0.427387 | 0.434662 | -0.309217 | -0.345146 | -0.220124 | 1.733102 | -0.327195 | -0.306055 | -0.367277 | -0.337788 | -0.187171 | 0.114281 | -0.421099 | 0.0 | -0.429804 | 0.661045 | -0.357595 | -0.315194 | -0.367630 | -0.224676 | -0.204051 | 3.488968 | -0.216127 | -0.311227 | 0.190799 | 0.926188 | -0.636077 | -0.530001 | -0.307769 | -0.356159 | -0.277974 | -0.337643 | -0.281795 | -0.190088 | 1.671295 | -0.351276 | -0.344829 | -0.430833 | -0.382935 | -0.526533 | -0.515336 | -0.450548 | -0.430986 | -0.609061 | -0.312114 | -0.398591 | -0.404664 | -0.293179 | -0.41431 | 4.989777 | -0.364628 | -0.366103 | -0.481483 | -0.450335 | -0.530292 | -0.212023 | -0.492124 | 2.345319 | -0.279376 | 0.368238 | -0.967979 | -0.528701 | -0.470776 | -0.326582 | -0.245369 | 0.162478 | -0.344064 | 0.707954 | -0.455492 | -0.743385 | 0.386454 | -0.025705 | -0.424651 | -0.309296 |
| 2 | 0.995474 | -0.459583 | -0.513765 | 0.371496 | 0.545028 | -0.397901 | -0.553430 | -0.327195 | -0.306055 | -0.367277 | -0.398793 | -0.187171 | -0.152370 | -0.421099 | 0.0 | -0.463537 | -0.595343 | -0.357595 | -0.315194 | -0.367630 | -0.224676 | -0.204051 | -0.229021 | -0.216127 | -0.311227 | 1.285606 | 2.694240 | -0.636077 | -0.530001 | -0.307769 | -0.356159 | -0.277974 | -0.337643 | -0.281795 | -0.190088 | -0.880409 | 1.513000 | -0.344829 | -0.430833 | -0.513168 | -0.526533 | -0.515336 | -0.450548 | 1.241259 | -0.609061 | -0.312114 | -0.427126 | 0.256373 | -0.293179 | -0.41431 | -0.306304 | -0.396945 | 0.171186 | -0.481483 | -0.450335 | -0.561314 | -0.212023 | -0.492124 | -0.731067 | -0.535643 | -1.021691 | -0.576404 | -0.647840 | -0.470776 | -0.326582 | -0.245369 | -0.792464 | 1.196480 | 1.886460 | -0.455492 | -0.743385 | -0.800389 | -0.482141 | -0.424651 | -0.590667 |
| 3 | -0.894298 | -0.483224 | -0.570768 | -0.309217 | -0.345146 | -0.397901 | -0.429121 | -0.103626 | -0.306055 | -0.367277 | -0.398793 | -0.187171 | -0.152370 | -0.307992 | 0.0 | -0.165751 | 2.850972 | -0.284834 | -0.315194 | -0.166115 | -0.224676 | -0.204051 | 0.112278 | -0.216127 | -0.708813 | -0.904008 | -0.841863 | -0.636077 | -0.530001 | -0.307769 | -0.356159 | -0.277974 | -0.337643 | -0.281795 | -0.190088 | 0.213340 | -0.351276 | -0.344829 | -0.430833 | -0.289242 | -0.523710 | -0.515336 | -0.371494 | -0.430986 | 0.004377 | -0.312114 | -0.419849 | -1.237572 | -0.293179 | -0.41431 | -0.280412 | -0.396945 | -0.366103 | -0.481483 | 0.499941 | -0.389686 | -0.212023 | -0.492124 | -0.439558 | -1.389498 | 0.423835 | -1.033241 | -0.657165 | -0.470776 | -0.326582 | -0.245369 | 2.377728 | -0.344064 | -0.234851 | -0.455492 | -0.743385 | -0.591064 | -0.461377 | -0.424651 | 0.085186 |
| 4 | 0.123917 | 1.488731 | -0.434834 | -0.309217 | -0.345146 | -0.267435 | -0.487123 | -0.327195 | -0.306055 | 1.883568 | -0.180792 | -0.187171 | -0.152370 | 0.066736 | 0.0 | -0.335453 | -0.543064 | -0.357595 | 4.106433 | 0.373158 | 0.092549 | -0.204051 | -0.229021 | -0.147439 | 0.086359 | 0.291521 | 0.077524 | 0.711676 | -0.530001 | -0.307769 | -0.356159 | -0.277974 | -0.337643 | -0.281795 | -0.190088 | -0.671160 | -0.351276 | -0.344829 | 0.982048 | -0.223345 | 0.857184 | -0.515336 | 1.814744 | -0.430986 | -0.298008 | -0.312114 | -0.427126 | -0.608264 | -0.293179 | -0.41431 | -0.306304 | -0.396945 | -0.366103 | -0.481483 | -0.450335 | -0.457227 | -0.212023 | -0.492124 | -0.716571 | -0.615758 | -0.020942 | -0.380617 | 0.580430 | -0.470776 | -0.326582 | -0.245369 | -0.727016 | -0.344064 | -0.391985 | -0.455492 | -0.743385 | -0.775134 | -0.477332 | -0.424651 | -0.588509 |
# Fetch ids for kaggle upload
kaggle_id = kaggle['id']


def _save_submission(predictions, filename):
    """Build and write one Kaggle submission CSV; return the frame.

    NOTE(review): .predict() yields hard 0/1 class labels, yet the
    column is named "churn_probability" — presumably the competition
    scores labels in that column; confirm, or switch to predict_proba
    if genuine probabilities are expected.
    """
    frame = pd.DataFrame({"id": kaggle_id, "churn_probability": predictions})
    frame.to_csv(filename, header=True, index=False)
    return frame


# Model building with PCA: project the cleaned features into the PCA
# space the PCA-based models were trained on.
kaggle_pred_pca = pca_final.transform(kaggle_pred)

#1 Logistic regression (PCA)
log1_pred_kaggle = log1_pca_model.predict(kaggle_pred_pca)
result = _save_submission(log1_pred_kaggle, 'Submission_log.csv')

#2 SVM (PCA)
svm1_pred_kaggle = svm1_pca_model.predict(kaggle_pred_pca)
result = _save_submission(svm1_pred_kaggle, 'Submission_svm.csv')

#3 Random Forest (PCA)
rf_pred_kaggle = rf3_pca_model.predict(kaggle_pred_pca)
result = _save_submission(rf_pred_kaggle, 'Submission_rf.csv')

#4 XGB (PCA)
xgb_pred_kaggle = xgb1_pca_model.predict(kaggle_pred_pca)
result = _save_submission(xgb_pred_kaggle, 'Submission_xgb.csv')

# Random forests on the raw (non-PCA) feature set.  Output file names
# kept byte-identical to the originals ("_3" was skipped originally).
rfc1_pred_kaggle = rfc1_model.predict(kaggle_pred)
result = _save_submission(rfc1_pred_kaggle, 'Submission_rf_1.csv')

rfc2_pred_kaggle = rfc2_model.predict(kaggle_pred)
result = _save_submission(rfc2_pred_kaggle, 'Submission_rf_2.csv')

rfc3_pred_kaggle = rfc3_model.predict(kaggle_pred)
result = _save_submission(rfc3_pred_kaggle, 'Submission_rf_4.csv')

rfc4_pred_kaggle = rfc4_model.predict(kaggle_pred)
result = _save_submission(rfc4_pred_kaggle, 'Submission_rf_5.csv')